From 944b8e07f557b9320895998fe33d71cae5199eee Mon Sep 17 00:00:00 2001
From: Shai Erera
Date: Sat, 17 Dec 2016 21:17:14 +0200
Subject: [PATCH] LUCENE-7590: add DocValuesStats for SortedNumeric DV fields

---
Review notes:
 - SortedNumericDocValuesStats.variance() divides by valuesCount: the running
   sum of squared deviations is accumulated per value, not per document.
 - SortedDoubleDocValuesStats seeds max with -Double.MAX_VALUE; Double.MIN_VALUE
   is the smallest positive double, so all-negative fields reported a wrong max.
 - Tests: compare double min/max via doubleValue(), honor the predicate passed
   to filterAndFlatValues(), and start the double values below zero.
 - Hunk line counts are unchanged; the blob ids on the "index" lines are stale.

 .../apache/lucene/search/DocValuesStats.java  | 128 ++++++++++++++-
 .../search/TestDocValuesStatsCollector.java   | 153 +++++++++++++++++-
 2 files changed, 271 insertions(+), 10 deletions(-)

diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
index c8b775200d2..9dd97a60232 100644
--- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
+++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
 
 /** Holds statistics for a DocValues field. */
 public abstract class DocValuesStats<T> {
@@ -95,7 +96,7 @@ public abstract class DocValuesStats<T> {
   }
 
   /** Holds statistics for a numeric DocValues field. */
-  public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
+  static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
 
     protected double mean = 0.0;
     protected double variance = 0.0;
@@ -113,7 +114,7 @@ public abstract class DocValuesStats<T> {
     }
 
     @Override
-    protected boolean hasValue(int doc) throws IOException {
+    protected final boolean hasValue(int doc) throws IOException {
       return ndv.advanceExact(doc);
     }
 
@@ -199,4 +200,127 @@ public abstract class DocValuesStats<T> {
     }
   }
 
+  /** Holds statistics for a sorted-numeric (multi-valued) DocValues field; mean, variance and sum cover every value. */
+  static abstract class SortedNumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
+
+    protected long valuesCount = 0;
+    protected double mean = 0.0;
+    protected double variance = 0.0;
+
+    protected SortedNumericDocValues sndv;
+
+    protected SortedNumericDocValuesStats(String field, T initialMin, T initialMax) {
+      super(field, initialMin, initialMax);
+    }
+
+    @Override
+    protected final boolean init(LeafReaderContext context) throws IOException {
+      sndv = context.reader().getSortedNumericDocValues(field);
+      return sndv != null;
+    }
+
+    @Override
+    protected final boolean hasValue(int doc) throws IOException {
+      return sndv.advanceExact(doc);
+    }
+
+    /** The mean of all values of the field. */
+    public final double mean() {
+      return mean;
+    }
+
+    /** Returns the (population) variance of all values of the field, or 0 if no value was accumulated. */
+    public final double variance() {
+      // 'variance' sums squared deviations once per value, so normalize by valuesCount rather than the document count()
+      return valuesCount > 0 ? variance / valuesCount : 0;
+    }
+
+    /** Returns the stdev of all values of the field. */
+    public final double stdev() {
+      return Math.sqrt(variance());
+    }
+
+    /** Returns the total number of values for this field. */
+    public final long valuesCount() {
+      return valuesCount;
+    }
+
+    /** Returns the sum of values of the field. Note that if the values are large, the {@code sum} might overflow. */
+    public abstract T sum();
+  }
+
+  /** Holds DocValues statistics for a sorted-numeric field storing {@code long} values. */
+  public static final class SortedLongDocValuesStats extends SortedNumericDocValuesStats<Long> {
+
+    // To avoid boxing 'long' to 'Long' while the sum is computed, declare it as private variable.
+    private long sum = 0;
+
+    public SortedLongDocValuesStats(String field) {
+      super(field, Long.MAX_VALUE, Long.MIN_VALUE);
+    }
+
+    @Override
+    protected void doAccumulate(int count) throws IOException {
+      int numValues = sndv.docValueCount();
+      while (numValues-- > 0) {
+        long val = sndv.nextValue();
+        if (val > max) {
+          max = val;
+        }
+        if (val < min) {
+          min = val;
+        }
+        sum += val;
+        double oldMean = mean;
+        // for correct "running average computation", increase valuesCount with each value, rather than once before the
+        // loop starts.
+        ++valuesCount;
+        mean += (val - mean) / valuesCount;
+        variance += (val - mean) * (val - oldMean);
+      }
+    }
+
+    @Override
+    public Long sum() {
+      return sum;
+    }
+  }
+
+  /** Holds DocValues statistics for a sorted-numeric field storing {@code double} values. */
+  public static final class SortedDoubleDocValuesStats extends SortedNumericDocValuesStats<Double> {
+
+    // To avoid boxing 'double' to 'Double' while the sum is computed, declare it as private variable.
+    private double sum = 0;
+
+    public SortedDoubleDocValuesStats(String field) {
+      super(field, Double.MAX_VALUE, -Double.MAX_VALUE); // not Double.MIN_VALUE: that is the smallest positive double
+    }
+
+    @Override
+    protected void doAccumulate(int count) throws IOException {
+      int numValues = sndv.docValueCount();
+      while (numValues-- > 0) {
+        double val = Double.longBitsToDouble(sndv.nextValue());
+        if (Double.compare(val, max) > 0) {
+          max = val;
+        }
+        if (Double.compare(val, min) < 0) {
+          min = val;
+        }
+        sum += val;
+        double oldMean = mean;
+        // for correct "running average computation", increase valuesCount with each value, rather than once before the
+        // loop starts.
+        ++valuesCount;
+        mean += (val - mean) / valuesCount;
+        variance += (val - mean) * (val - oldMean);
+      }
+    }
+
+    @Override
+    public Double sum() {
+      return sum;
+    }
+  }
+
 }
\ No newline at end of file
diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
index 8f8b09e6bac..5fa4b04a196 100644
--- a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
+++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
@@ -20,19 +20,24 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.DoubleSummaryStatistics;
 import java.util.LongSummaryStatistics;
+import java.util.function.Predicate;
 import java.util.stream.DoubleStream;
 import java.util.stream.LongStream;
+import java.util.stream.Stream;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoubleDocValuesField;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
 import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
+import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats;
+import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
@@ -185,20 +190,136 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
     }
   }
 
-  private static LongStream getPositiveValues(long[] docValues) {
-    return Arrays.stream(docValues).filter(v -> v > 0);
+  public void testDocsWithMultipleLongValues() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      String field = "numeric";
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      long[][] docValues = new long[numDocs][];
+      long nextVal = 1;
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        if (random().nextBoolean()) { // not all documents have a value
+          int numValues = TestUtil.nextInt(random(), 1, 5);
+          docValues[i] = new long[numValues];
+          for (int j = 0; j < numValues; j++) {
+            doc.add(new SortedNumericDocValuesField(field, nextVal));
+            docValues[i][j] = nextVal;
+            ++nextVal;
+          }
+          doc.add(new StringField("id", "doc" + i, Store.NO));
+        }
+        indexWriter.addDocument(doc);
+      }
+
+      // 20% of cases delete some docs
+      if (random().nextDouble() < 0.2) {
+        for (int i = 0; i < numDocs; i++) {
+          if (random().nextBoolean()) {
+            indexWriter.deleteDocuments(new Term("id", "doc" + i));
+            docValues[i] = null;
+          }
+        }
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field);
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
+        assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
+        if (stats.count() > 0) {
+          LongSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
+          assertEquals(sumStats.getMax(), stats.max().longValue());
+          assertEquals(sumStats.getMin(), stats.min().longValue());
+          assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
+          assertEquals(sumStats.getSum(), stats.sum().longValue());
+          assertEquals(sumStats.getCount(), stats.valuesCount());
+          double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.valuesCount());
+          assertEquals(variance, stats.variance(), 0.00001);
+          assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+        }
+      }
+    }
   }
 
-  private static DoubleStream getPositiveValues(double[] docValues) {
-    return Arrays.stream(docValues).filter(v -> v > 0);
+  public void testDocsWithMultipleDoubleValues() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      String field = "numeric";
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      double[][] docValues = new double[numDocs][];
+      double nextVal = -50; // start below zero so min()/max() are also exercised on negative values
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        if (random().nextBoolean()) { // not all documents have a value
+          int numValues = TestUtil.nextInt(random(), 1, 5);
+          docValues[i] = new double[numValues];
+          for (int j = 0; j < numValues; j++) {
+            doc.add(new SortedNumericDocValuesField(field, Double.doubleToRawLongBits(nextVal)));
+            docValues[i][j] = nextVal;
+            ++nextVal;
+          }
+          doc.add(new StringField("id", "doc" + i, Store.NO));
+        }
+        indexWriter.addDocument(doc);
+      }
+
+      // 20% of cases delete some docs
+      if (random().nextDouble() < 0.2) {
+        for (int i = 0; i < numDocs; i++) {
+          if (random().nextBoolean()) {
+            indexWriter.deleteDocuments(new Term("id", "doc" + i));
+            docValues[i] = null;
+          }
+        }
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field);
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
+        assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
+        if (stats.count() > 0) {
+          DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
+          assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001);
+          assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001);
+          assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
+          assertEquals(sumStats.getSum(), stats.sum().doubleValue(), 0.00001);
+          assertEquals(sumStats.getCount(), stats.valuesCount());
+          double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.valuesCount());
+          assertEquals(variance, stats.variance(), 0.00001);
+          assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+        }
+      }
+    }
   }
 
-  private static LongStream getZeroValues(long[] docValues) {
-    return Arrays.stream(docValues).filter(v -> v == 0);
+  private static LongStream getPositiveValues(long[] values) {
+    return Arrays.stream(values).filter(v -> v > 0);
   }
 
-  private static DoubleStream getZeroValues(double[] docValues) {
-    return Arrays.stream(docValues).filter(v -> v == 0);
+  private static DoubleStream getPositiveValues(double[] values) {
+    return Arrays.stream(values).filter(v -> v > 0);
+  }
+
+  private static LongStream getZeroValues(long[] values) {
+    return Arrays.stream(values).filter(v -> v == 0);
+  }
+
+  private static DoubleStream getZeroValues(double[] values) {
+    return Arrays.stream(values).filter(v -> v == 0);
+  }
+
+  private static Stream<long[]> filterValues(long[][] values, Predicate<? super long[]> p) {
+    return Arrays.stream(values).filter(p);
+  }
+
+  private static Stream<double[]> filterValues(double[][] values, Predicate<? super double[]> p) {
+    return Arrays.stream(values).filter(p);
   }
 
   private static double computeVariance(long[] values, double mean, int count) {
@@ -209,4 +330,20 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
     return getPositiveValues(values).map(v -> (v - mean) * (v-mean)).sum() / count;
   }
 
+  private static LongStream filterAndFlatValues(long[][] values, Predicate<? super long[]> p) {
+    return filterValues(values, p).flatMapToLong(Arrays::stream);
+  }
+
+  private static DoubleStream filterAndFlatValues(double[][] values, Predicate<? super double[]> p) {
+    return filterValues(values, p).flatMapToDouble(Arrays::stream);
+  }
+
+  private static double computeVariance(LongStream values, double mean, long count) {
+    return values.mapToDouble(v -> (v - mean) * (v-mean)).sum() / count;
+  }
+
+  private static double computeVariance(DoubleStream values, double mean, long count) {
+    return values.map(v -> (v - mean) * (v-mean)).sum() / count;
+  }
+
 }