mirror of https://github.com/apache/lucene.git
LUCENE-7590: add DocValuesStatsCollector
This commit is contained in:
parent
73965bad07
commit
43f4f7a279
|
@ -0,0 +1,165 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
|
|
||||||
|
/** Holds statistics for a DocValues field. */
|
||||||
|
public abstract class DocValuesStats<T> {
|
||||||
|
|
||||||
|
private int missing = 0;
|
||||||
|
private int count = 0;
|
||||||
|
|
||||||
|
protected final String field;
|
||||||
|
|
||||||
|
protected T min;
|
||||||
|
protected T max;
|
||||||
|
|
||||||
|
protected DocValuesStats(String field, T initialMin, T initialMax) {
|
||||||
|
this.field = field;
|
||||||
|
this.min = initialMin;
|
||||||
|
this.max = initialMax;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called after #{@link DocValuesStats#accumulate(int)} was processed and verified that the document has a value for
|
||||||
|
* the field. Implementations should update the statistics based on the value of the current document.
|
||||||
|
*
|
||||||
|
* @param count
|
||||||
|
* the updated number of documents with value for this field.
|
||||||
|
*/
|
||||||
|
protected abstract void doAccumulate(int count) throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initializes this object with the given reader context. Returns whether stats can be computed for this segment (i.e.
|
||||||
|
* it does have the requested DocValues field).
|
||||||
|
*/
|
||||||
|
protected abstract boolean init(LeafReaderContext contxt) throws IOException;
|
||||||
|
|
||||||
|
/** Returns whether the given document has a value for the requested DocValues field. */
|
||||||
|
protected abstract boolean hasValue(int doc) throws IOException;
|
||||||
|
|
||||||
|
final void accumulate(int doc) throws IOException {
|
||||||
|
if (hasValue(doc)) {
|
||||||
|
++count;
|
||||||
|
doAccumulate(count);
|
||||||
|
} else {
|
||||||
|
++missing;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final void addMissing() {
|
||||||
|
++missing;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The field for which these stats were computed. */
|
||||||
|
public final String field() {
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The number of documents which have a value of the field. */
|
||||||
|
public final int count() {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The number of documents which do not have a value of the field. */
|
||||||
|
public final int missing() {
|
||||||
|
return missing;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The minimum value of the field. Undefined when {@link #count} is zero. */
|
||||||
|
public final T min() {
|
||||||
|
return min;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The maximum value of the field. Undefined when {@link #count} is zero. */
|
||||||
|
public final T max() {
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Holds statistics for a numeric DocValues field. */
|
||||||
|
public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
|
||||||
|
|
||||||
|
protected double mean = 0.0;
|
||||||
|
|
||||||
|
protected NumericDocValues ndv;
|
||||||
|
|
||||||
|
protected NumericDocValuesStats(String field, T initialMin, T initialMax) {
|
||||||
|
super(field, initialMin, initialMax);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final boolean init(LeafReaderContext contxt) throws IOException {
|
||||||
|
ndv = contxt.reader().getNumericDocValues(field);
|
||||||
|
return ndv != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean hasValue(int doc) throws IOException {
|
||||||
|
return ndv.advanceExact(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The mean of all values of the field. Undefined when {@link #count} is zero. */
|
||||||
|
public final double mean() {
|
||||||
|
return mean;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Holds DocValues statistics for a numeric field storing {@code long} values. */
|
||||||
|
public static final class LongDocValuesStats extends NumericDocValuesStats<Long> {
|
||||||
|
|
||||||
|
public LongDocValuesStats(String description) {
|
||||||
|
super(description, Long.MAX_VALUE, Long.MIN_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doAccumulate(int count) throws IOException {
|
||||||
|
long val = ndv.longValue();
|
||||||
|
if (val > max) {
|
||||||
|
max = val;
|
||||||
|
}
|
||||||
|
if (val < min) {
|
||||||
|
min = val;
|
||||||
|
}
|
||||||
|
mean += (val - mean) / count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Holds DocValues statistics for a numeric field storing {@code double} values. */
|
||||||
|
public static final class DoubleDocValuesStats extends NumericDocValuesStats<Double> {
|
||||||
|
|
||||||
|
public DoubleDocValuesStats(String description) {
|
||||||
|
super(description, Double.MAX_VALUE, Double.MIN_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doAccumulate(int count) throws IOException {
|
||||||
|
double val = Double.longBitsToDouble(ndv.longValue());
|
||||||
|
if (Double.compare(val, max) > 0) {
|
||||||
|
max = val;
|
||||||
|
}
|
||||||
|
if (Double.compare(val, min) < 0) {
|
||||||
|
min = val;
|
||||||
|
}
|
||||||
|
mean += (val - mean) / count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
|
||||||
|
/** A {@link Collector} which computes statistics for a DocValues field. */
|
||||||
|
public class DocValuesStatsCollector implements Collector {
|
||||||
|
|
||||||
|
private final DocValuesStats<?> stats;
|
||||||
|
|
||||||
|
/** Creates a collector to compute statistics for a DocValues field using the given {@code stats}. */
|
||||||
|
public DocValuesStatsCollector(DocValuesStats<?> stats) {
|
||||||
|
this.stats = stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
|
||||||
|
boolean shouldProcess = stats.init(context);
|
||||||
|
if (!shouldProcess) {
|
||||||
|
// Stats cannot be computed for this segment, therefore consider all matching documents as a 'miss'.
|
||||||
|
return new LeafCollector() {
|
||||||
|
@Override public void setScorer(Scorer scorer) throws IOException {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void collect(int doc) throws IOException {
|
||||||
|
// All matching documents in this reader are missing a value
|
||||||
|
stats.addMissing();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return new LeafCollector() {
|
||||||
|
@Override public void setScorer(Scorer scorer) throws IOException {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void collect(int doc) throws IOException {
|
||||||
|
stats.accumulate(doc);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean needsScores() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,166 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.stream.DoubleStream;
|
||||||
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.DoubleDocValuesField;
|
||||||
|
import org.apache.lucene.document.Field.Store;
|
||||||
|
import org.apache.lucene.document.NumericDocValuesField;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
|
||||||
|
import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
/** Unit tests for {@link DocValuesStatsCollector}. */
|
||||||
|
public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testNoDocsWithField() throws IOException {
|
||||||
|
try (Directory dir = newDirectory();
|
||||||
|
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||||
|
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
indexWriter.addDocument(new Document());
|
||||||
|
}
|
||||||
|
|
||||||
|
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
LongDocValuesStats stats = new LongDocValuesStats("foo");
|
||||||
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
|
assertEquals(0, stats.count());
|
||||||
|
assertEquals(numDocs, stats.missing());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomDocsWithLongValues() throws IOException {
|
||||||
|
try (Directory dir = newDirectory();
|
||||||
|
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||||
|
String field = "numeric";
|
||||||
|
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||||
|
long[] docValues = new long[numDocs];
|
||||||
|
int nextVal = 1;
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
if (random().nextBoolean()) { // not all documents have a value
|
||||||
|
doc.add(new NumericDocValuesField(field, nextVal));
|
||||||
|
doc.add(new StringField("id", "doc" + i, Store.NO));
|
||||||
|
docValues[i] = nextVal;
|
||||||
|
++nextVal;
|
||||||
|
}
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 20% of cases delete some docs
|
||||||
|
if (random().nextDouble() < 0.2) {
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
indexWriter.deleteDocuments(new Term("id", "doc" + i));
|
||||||
|
docValues[i] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
LongDocValuesStats stats = new LongDocValuesStats(field);
|
||||||
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
|
int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count();
|
||||||
|
assertEquals(expCount, stats.count());
|
||||||
|
assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing());
|
||||||
|
if (stats.count() > 0) {
|
||||||
|
assertEquals(getPositiveValues(docValues).max().getAsLong(), stats.max().longValue());
|
||||||
|
assertEquals(getPositiveValues(docValues).min().getAsLong(), stats.min().longValue());
|
||||||
|
assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomDocsWithDoubleValues() throws IOException {
|
||||||
|
try (Directory dir = newDirectory();
|
||||||
|
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||||
|
String field = "numeric";
|
||||||
|
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||||
|
double[] docValues = new double[numDocs];
|
||||||
|
double nextVal = 1.0;
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
if (random().nextBoolean()) { // not all documents have a value
|
||||||
|
doc.add(new DoubleDocValuesField(field, nextVal));
|
||||||
|
doc.add(new StringField("id", "doc" + i, Store.NO));
|
||||||
|
docValues[i] = nextVal;
|
||||||
|
++nextVal;
|
||||||
|
}
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 20% of cases delete some docs
|
||||||
|
if (random().nextDouble() < 0.2) {
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
indexWriter.deleteDocuments(new Term("id", "doc" + i));
|
||||||
|
docValues[i] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
DoubleDocValuesStats stats = new DoubleDocValuesStats(field);
|
||||||
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
|
int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count();
|
||||||
|
assertEquals(expCount, stats.count());
|
||||||
|
assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing());
|
||||||
|
if (stats.count() > 0) {
|
||||||
|
assertEquals(getPositiveValues(docValues).max().getAsDouble(), stats.max().doubleValue(), 0.00001);
|
||||||
|
assertEquals(getPositiveValues(docValues).min().getAsDouble(), stats.min().doubleValue(), 0.00001);
|
||||||
|
assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static LongStream getPositiveValues(long[] docValues) {
|
||||||
|
return Arrays.stream(docValues).filter(v -> v > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DoubleStream getPositiveValues(double[] docValues) {
|
||||||
|
return Arrays.stream(docValues).filter(v -> v > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static LongStream getZeroValues(long[] docValues) {
|
||||||
|
return Arrays.stream(docValues).filter(v -> v == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DoubleStream getZeroValues(double[] docValues) {
|
||||||
|
return Arrays.stream(docValues).filter(v -> v == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue