diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
new file mode 100644
index 00000000000..fad9f97f0e2
--- /dev/null
+++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+
+/**
+ * Holds statistics for a DocValues field.
+ *
+ * @param <T> the type of the minimum and maximum values
+ */
+public abstract class DocValuesStats<T> {
+
+  private int missing = 0;
+  private int count = 0;
+
+  protected final String field;
+
+  protected T min;
+  protected T max;
+
+  /**
+   * Creates statistics for the given field.
+   *
+   * @param field the name of the DocValues field
+   * @param initialMin the value {@link #min()} starts from
+   * @param initialMax the value {@link #max()} starts from
+   */
+  protected DocValuesStats(String field, T initialMin, T initialMax) {
+    this.field = field;
+    this.min = initialMin;
+    this.max = initialMax;
+  }
+
+  /**
+   * Called after {@link DocValuesStats#accumulate(int)} was processed and verified that the document has a value for
+   * the field. Implementations should update the statistics based on the value of the current document.
+   *
+   * @param count
+   *          the updated number of documents with value for this field.
+   */
+  protected abstract void doAccumulate(int count) throws IOException;
+
+  /**
+   * Initializes this object with the given reader context. Returns whether stats can be computed for this segment (i.e.
+   * it does have the requested DocValues field).
+   */
+  protected abstract boolean init(LeafReaderContext context) throws IOException;
+
+  /** Returns whether the given document has a value for the requested DocValues field. */
+  protected abstract boolean hasValue(int doc) throws IOException;
+
+  /**
+   * Records one document: a document with a value is counted and handed to {@link #doAccumulate(int)}, any other
+   * document is counted as missing.
+   */
+  final void accumulate(int doc) throws IOException {
+    if (hasValue(doc)) {
+      ++count;
+      doAccumulate(count);
+    } else {
+      ++missing;
+    }
+  }
+
+  /** Records one document as missing a value, without looking at the document. */
+  final void addMissing() {
+    ++missing;
+  }
+
+  /** The field for which these stats were computed. */
+  public final String field() {
+    return field;
+  }
+
+  /** The number of documents which have a value of the field. */
+  public final int count() {
+    return count;
+  }
+
+  /** The number of documents which do not have a value of the field. */
+  public final int missing() {
+    return missing;
+  }
+
+  /** The minimum value of the field. Undefined when {@link #count()} is zero. */
+  public final T min() {
+    return min;
+  }
+
+  /** The maximum value of the field. Undefined when {@link #count()} is zero. */
+  public final T max() {
+    return max;
+  }
+
+  /**
+   * Holds statistics for a numeric DocValues field.
+   *
+   * @param <T> the numeric type of the minimum and maximum values
+   */
+  public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
+
+    protected double mean = 0.0;
+
+    protected NumericDocValues ndv;
+
+    protected NumericDocValuesStats(String field, T initialMin, T initialMax) {
+      super(field, initialMin, initialMax);
+    }
+
+    @Override
+    protected final boolean init(LeafReaderContext context) throws IOException {
+      ndv = context.reader().getNumericDocValues(field);
+      return ndv != null;
+    }
+
+    @Override
+    protected boolean hasValue(int doc) throws IOException {
+      return ndv.advanceExact(doc);
+    }
+
+    /** The mean of all values of the field. Undefined when {@link #count()} is zero. */
+    public final double mean() {
+      return mean;
+    }
+  }
+
+  /** Holds DocValues statistics for a numeric field storing {@code long} values. */
+  public static final class LongDocValuesStats extends NumericDocValuesStats<Long> {
+
+    /** Creates statistics for the {@code long} DocValues field with the given name. */
+    public LongDocValuesStats(String field) {
+      // Seed min and max with the opposite extremes of the long range.
+      super(field, Long.MAX_VALUE, Long.MIN_VALUE);
+    }
+
+    @Override
+    protected void doAccumulate(int count) throws IOException {
+      long val = ndv.longValue();
+      if (val > max) {
+        max = val;
+      }
+      if (val < min) {
+        min = val;
+      }
+      // Incremental update of the mean, no running sum is kept.
+      mean += (val - mean) / count;
+    }
+  }
+
+  /** Holds DocValues statistics for a numeric field storing {@code double} values. */
+  public static final class DoubleDocValuesStats extends NumericDocValuesStats<Double> {
+
+    /** Creates statistics for the {@code double} DocValues field with the given name. */
+    public DoubleDocValuesStats(String field) {
+      // Double.MIN_VALUE is the smallest positive double, so seeding the maximum with it would hide every negative
+      // value. The infinities are the identities of min and max.
+      super(field, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY);
+    }
+
+    @Override
+    protected void doAccumulate(int count) throws IOException {
+      // The long read from the doc values holds the bits of the double.
+      double val = Double.longBitsToDouble(ndv.longValue());
+      if (Double.compare(val, max) > 0) {
+        max = val;
+      }
+      if (Double.compare(val, min) < 0) {
+        min = val;
+      }
+      mean += (val - mean) / count;
+    }
+  }
+}
diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java
new file mode 100644
index 00000000000..2b1fa4fb852
--- /dev/null
+++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReaderContext;
+
+/** A {@link Collector} which computes statistics for a DocValues field. */
+public class DocValuesStatsCollector implements Collector {
+
+  private final DocValuesStats<?> stats;
+
+  /** Creates a collector to compute statistics for a DocValues field using the given {@code stats}. */
+  public DocValuesStatsCollector(DocValuesStats<?> stats) {
+    this.stats = stats;
+  }
+
+  /** Returns a collector which reports every collected document of the given segment to the stats. */
+  @Override
+  public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
+    boolean shouldProcess = stats.init(context);
+    if (!shouldProcess) {
+      // Stats cannot be computed for this segment, therefore consider all matching documents as a 'miss'.
+      return new LeafCollector() {
+        @Override public void setScorer(Scorer scorer) throws IOException {}
+        @Override
+        public void collect(int doc) throws IOException {
+          // All matching documents in this reader are missing a value
+          stats.addMissing();
+        }
+      };
+    }
+
+    return new LeafCollector() {
+      @Override public void setScorer(Scorer scorer) throws IOException {}
+      @Override
+      public void collect(int doc) throws IOException {
+        stats.accumulate(doc);
+      }
+    };
+  }
+
+  /** Returns {@code false}: neither leaf collector reads the scorer. */
+  @Override
+  public boolean needsScores() {
+    return false;
+  }
+}
diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
new file mode 100644
index 00000000000..65f82e62d42
--- /dev/null
+++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.DoubleStream;
+import java.util.stream.LongStream;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DoubleDocValuesField;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
+import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+/** Unit tests for {@link DocValuesStatsCollector}. */
+public class TestDocValuesStatsCollector extends LuceneTestCase {
+
+  public void testNoDocsWithField() throws IOException {
+    try (Directory dir = newDirectory(); IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      for (int i = 0; i < numDocs; i++) {
+        indexWriter.addDocument(new Document());
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        LongDocValuesStats stats = new LongDocValuesStats("foo");
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        assertEquals(0, stats.count());
+        assertEquals(numDocs, stats.missing());
+      }
+    }
+  }
+
+  public void testRandomDocsWithLongValues() throws IOException {
+    try (Directory dir = newDirectory(); IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      String field = "numeric";
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      long[] docValues = new long[numDocs];
+      int nextVal = 1;
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        if (random().nextBoolean()) { // not all documents have a value
+          doc.add(new NumericDocValuesField(field, nextVal));
+          doc.add(new StringField("id", "doc" + i, Store.NO));
+          docValues[i] = nextVal;
+          ++nextVal;
+        }
+        indexWriter.addDocument(doc);
+      }
+
+      // 20% of cases delete some docs
+      if (random().nextDouble() < 0.2) {
+        for (int i = 0; i < numDocs; i++) {
+          if (random().nextBoolean()) {
+            indexWriter.deleteDocuments(new Term("id", "doc" + i));
+            docValues[i] = 0;
+          }
+        }
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        LongDocValuesStats stats = new LongDocValuesStats(field);
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        assertEquals((int) getPositiveValues(docValues).count(), stats.count());
+        assertEquals(computeExpMissing((int) getZeroValues(docValues).count(), numDocs, reader), stats.missing());
+        if (stats.count() > 0) {
+          assertEquals(getPositiveValues(docValues).max().getAsLong(), stats.max().longValue());
+          assertEquals(getPositiveValues(docValues).min().getAsLong(), stats.min().longValue());
+          assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001);
+        }
+      }
+    }
+  }
+
+  public void testRandomDocsWithDoubleValues() throws IOException {
+    try (Directory dir = newDirectory(); IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      String field = "numeric";
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      double[] docValues = new double[numDocs];
+      double nextVal = 1.0;
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        if (random().nextBoolean()) { // not all documents have a value
+          doc.add(new DoubleDocValuesField(field, nextVal));
+          doc.add(new StringField("id", "doc" + i, Store.NO));
+          docValues[i] = nextVal;
+          ++nextVal;
+        }
+        indexWriter.addDocument(doc);
+      }
+
+      // 20% of cases delete some docs
+      if (random().nextDouble() < 0.2) {
+        for (int i = 0; i < numDocs; i++) {
+          if (random().nextBoolean()) {
+            indexWriter.deleteDocuments(new Term("id", "doc" + i));
+            docValues[i] = 0;
+          }
+        }
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        DoubleDocValuesStats stats = new DoubleDocValuesStats(field);
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        assertEquals((int) getPositiveValues(docValues).count(), stats.count());
+        assertEquals(computeExpMissing((int) getZeroValues(docValues).count(), numDocs, reader), stats.missing());
+        if (stats.count() > 0) {
+          assertEquals(getPositiveValues(docValues).max().getAsDouble(), stats.max().doubleValue(), 0.00001);
+          assertEquals(getPositiveValues(docValues).min().getAsDouble(), stats.min().doubleValue(), 0.00001);
+          assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001);
+        }
+      }
+    }
+  }
+
+  private static int computeExpMissing(int numDocsWithoutField, int numIndexedDocs, DirectoryReader reader) {
+    // Deleted documents are never collected, so they are not misses. They are either still marked as deleted in the
+    // reader or, when their whole segment was dropped, they are what separates the indexed documents from maxDoc.
+    return numDocsWithoutField - reader.numDeletedDocs() - (numIndexedDocs - reader.maxDoc());
+  }
+
+  private static LongStream getPositiveValues(long[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v > 0);
+  }
+
+  private static DoubleStream getPositiveValues(double[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v > 0);
+  }
+
+  private static LongStream getZeroValues(long[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v == 0);
+  }
+
+  private static DoubleStream getZeroValues(double[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v == 0);
+  }
+}