mirror of https://github.com/apache/lucene.git
LUCENE-7590: add DocValuesStats for SortedNumeric DV fields
This commit is contained in:
parent
dcf202a958
commit
944b8e07f5
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.SortedNumericDocValues;
|
||||
|
||||
/** Holds statistics for a DocValues field. */
|
||||
public abstract class DocValuesStats<T> {
|
||||
|
@ -95,7 +96,7 @@ public abstract class DocValuesStats<T> {
|
|||
}
|
||||
|
||||
/** Holds statistics for a numeric DocValues field. */
|
||||
public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
|
||||
static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
|
||||
|
||||
protected double mean = 0.0;
|
||||
protected double variance = 0.0;
|
||||
|
@ -113,7 +114,7 @@ public abstract class DocValuesStats<T> {
|
|||
}
|
||||
|
||||
/** Positions {@code ndv} on {@code doc} via {@code advanceExact} and returns that call's result. */
@Override
|
||||
protected boolean hasValue(int doc) throws IOException {
|
||||
protected final boolean hasValue(int doc) throws IOException {
|
||||
return ndv.advanceExact(doc);
|
||||
}
|
||||
|
||||
|
@ -199,4 +200,127 @@ public abstract class DocValuesStats<T> {
|
|||
}
|
||||
}
|
||||
|
||||
/** Holds statistics for a sorted-numeric DocValues field. */
|
||||
static abstract class SortedNumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
|
||||
|
||||
protected long valuesCount = 0;
|
||||
protected double mean = 0.0;
|
||||
protected double variance = 0.0;
|
||||
|
||||
protected SortedNumericDocValues sndv;
|
||||
|
||||
protected SortedNumericDocValuesStats(String field, T initialMin, T initialMax) {
|
||||
super(field, initialMin, initialMax);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final boolean init(LeafReaderContext context) throws IOException {
|
||||
sndv = context.reader().getSortedNumericDocValues(field);
|
||||
return sndv != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final boolean hasValue(int doc) throws IOException {
|
||||
return sndv.advanceExact(doc);
|
||||
}
|
||||
|
||||
/** The mean of all values of the field. */
|
||||
public final double mean() {
|
||||
return mean;
|
||||
}
|
||||
|
||||
/** Returns the variance of all values of the field. */
|
||||
public final double variance() {
|
||||
int count = count();
|
||||
return count > 0 ? variance / count : 0;
|
||||
}
|
||||
|
||||
/** Returns the stdev of all values of the field. */
|
||||
public final double stdev() {
|
||||
return Math.sqrt(variance());
|
||||
}
|
||||
|
||||
/** Returns the total number of values for this field. */
|
||||
public final long valuesCount() {
|
||||
return valuesCount;
|
||||
}
|
||||
|
||||
/** Returns the sum of values of the field. Note that if the values are large, the {@code sum} might overflow. */
|
||||
public abstract T sum();
|
||||
}
|
||||
|
||||
/** Holds DocValues statistics for a sorted-numeric field storing {@code long} values. */
|
||||
public static final class SortedLongDocValuesStats extends SortedNumericDocValuesStats<Long> {
|
||||
|
||||
// To avoid boxing 'long' to 'Long' while the sum is computed, declare it as private variable.
|
||||
private long sum = 0;
|
||||
|
||||
public SortedLongDocValuesStats(String field) {
|
||||
super(field, Long.MAX_VALUE, Long.MIN_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doAccumulate(int count) throws IOException {
|
||||
int numValues = sndv.docValueCount();
|
||||
while (numValues-- > 0) {
|
||||
long val = sndv.nextValue();
|
||||
if (val > max) {
|
||||
max = val;
|
||||
}
|
||||
if (val < min) {
|
||||
min = val;
|
||||
}
|
||||
sum += val;
|
||||
double oldMean = mean;
|
||||
// for correct "running average computation", increase valuesCount with each value, rather than once before the
|
||||
// loop stats.
|
||||
++valuesCount;
|
||||
mean += (val - mean) / valuesCount;
|
||||
variance += (val - mean) * (val - oldMean);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long sum() {
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
||||
/** Holds DocValues statistics for a sorted-numeric field storing {@code double} values. */
|
||||
public static final class SortedDoubleDocValuesStats extends SortedNumericDocValuesStats<Double> {
|
||||
|
||||
// To avoid boxing 'double' to 'Double' while the sum is computed, declare it as private variable.
|
||||
private double sum = 0;
|
||||
|
||||
public SortedDoubleDocValuesStats(String field) {
|
||||
super(field, Double.MAX_VALUE, Double.MIN_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doAccumulate(int count) throws IOException {
|
||||
int numValues = sndv.docValueCount();
|
||||
while (numValues-- > 0) {
|
||||
double val = Double.longBitsToDouble(sndv.nextValue());
|
||||
if (Double.compare(val, max) > 0) {
|
||||
max = val;
|
||||
}
|
||||
if (Double.compare(val, min) < 0) {
|
||||
min = val;
|
||||
}
|
||||
sum += val;
|
||||
double oldMean = mean;
|
||||
// for correct "running average computation", increase valuesCount with each value, rather than once before the
|
||||
// loop stats.
|
||||
++valuesCount;
|
||||
mean += (val - mean) / valuesCount;
|
||||
variance += (val - mean) * (val - oldMean);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Double sum() {
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -20,19 +20,24 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
import java.util.DoubleSummaryStatistics;
|
||||
import java.util.LongSummaryStatistics;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.DoubleStream;
|
||||
import java.util.stream.LongStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.DoubleDocValuesField;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedNumericDocValuesField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
|
||||
import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
|
||||
import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats;
|
||||
import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
@ -185,20 +190,136 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private static LongStream getPositiveValues(long[] docValues) {
|
||||
return Arrays.stream(docValues).filter(v -> v > 0);
|
||||
public void testDocsWithMultipleLongValues() throws IOException {
|
||||
try (Directory dir = newDirectory();
|
||||
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||
String field = "numeric";
|
||||
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||
long[][] docValues = new long[numDocs][];
|
||||
long nextVal = 1;
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document doc = new Document();
|
||||
if (random().nextBoolean()) { // not all documents have a value
|
||||
int numValues = TestUtil.nextInt(random(), 1, 5);
|
||||
docValues[i] = new long[numValues];
|
||||
for (int j = 0; j < numValues; j++) {
|
||||
doc.add(new SortedNumericDocValuesField(field, nextVal));
|
||||
docValues[i][j] = nextVal;
|
||||
++nextVal;
|
||||
}
|
||||
doc.add(new StringField("id", "doc" + i, Store.NO));
|
||||
}
|
||||
indexWriter.addDocument(doc);
|
||||
}
|
||||
|
||||
private static DoubleStream getPositiveValues(double[] docValues) {
|
||||
return Arrays.stream(docValues).filter(v -> v > 0);
|
||||
// 20% of cases delete some docs
|
||||
if (random().nextDouble() < 0.2) {
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
if (random().nextBoolean()) {
|
||||
indexWriter.deleteDocuments(new Term("id", "doc" + i));
|
||||
docValues[i] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static LongStream getZeroValues(long[] docValues) {
|
||||
return Arrays.stream(docValues).filter(v -> v == 0);
|
||||
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field);
|
||||
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||
|
||||
assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
|
||||
assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
|
||||
if (stats.count() > 0) {
|
||||
LongSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
|
||||
assertEquals(sumStats.getMax(), stats.max().longValue());
|
||||
assertEquals(sumStats.getMin(), stats.min().longValue());
|
||||
assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
|
||||
assertEquals(sumStats.getSum(), stats.sum().longValue());
|
||||
assertEquals(sumStats.getCount(), stats.valuesCount());
|
||||
double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count());
|
||||
assertEquals(variance, stats.variance(), 0.00001);
|
||||
assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static DoubleStream getZeroValues(double[] docValues) {
|
||||
return Arrays.stream(docValues).filter(v -> v == 0);
|
||||
public void testDocsWithMultipleDoubleValues() throws IOException {
|
||||
try (Directory dir = newDirectory();
|
||||
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||
String field = "numeric";
|
||||
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||
double[][] docValues = new double[numDocs][];
|
||||
double nextVal = 1;
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document doc = new Document();
|
||||
if (random().nextBoolean()) { // not all documents have a value
|
||||
int numValues = TestUtil.nextInt(random(), 1, 5);
|
||||
docValues[i] = new double[numValues];
|
||||
for (int j = 0; j < numValues; j++) {
|
||||
doc.add(new SortedNumericDocValuesField(field, Double.doubleToRawLongBits(nextVal)));
|
||||
docValues[i][j] = nextVal;
|
||||
++nextVal;
|
||||
}
|
||||
doc.add(new StringField("id", "doc" + i, Store.NO));
|
||||
}
|
||||
indexWriter.addDocument(doc);
|
||||
}
|
||||
|
||||
// 20% of cases delete some docs
|
||||
if (random().nextDouble() < 0.2) {
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
if (random().nextBoolean()) {
|
||||
indexWriter.deleteDocuments(new Term("id", "doc" + i));
|
||||
docValues[i] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field);
|
||||
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||
|
||||
assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
|
||||
assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
|
||||
if (stats.count() > 0) {
|
||||
DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
|
||||
assertEquals(sumStats.getMax(), stats.max().longValue(), 0.00001);
|
||||
assertEquals(sumStats.getMin(), stats.min().longValue(), 0.00001);
|
||||
assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
|
||||
assertEquals(sumStats.getSum(), stats.sum().doubleValue(), 0.00001);
|
||||
assertEquals(sumStats.getCount(), stats.valuesCount());
|
||||
double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count());
|
||||
assertEquals(variance, stats.variance(), 0.00001);
|
||||
assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static LongStream getPositiveValues(long[] values) {
|
||||
return Arrays.stream(values).filter(v -> v > 0);
|
||||
}
|
||||
|
||||
private static DoubleStream getPositiveValues(double[] values) {
|
||||
return Arrays.stream(values).filter(v -> v > 0);
|
||||
}
|
||||
|
||||
private static LongStream getZeroValues(long[] values) {
|
||||
return Arrays.stream(values).filter(v -> v == 0);
|
||||
}
|
||||
|
||||
private static DoubleStream getZeroValues(double[] values) {
|
||||
return Arrays.stream(values).filter(v -> v == 0);
|
||||
}
|
||||
|
||||
private static Stream<long[]> filterValues(long[][] values, Predicate<? super long[]> p) {
|
||||
return Arrays.stream(values).filter(p);
|
||||
}
|
||||
|
||||
private static Stream<double[]> filterValues(double[][] values, Predicate<? super double[]> p) {
|
||||
return Arrays.stream(values).filter(p);
|
||||
}
|
||||
|
||||
private static double computeVariance(long[] values, double mean, int count) {
|
||||
|
@ -209,4 +330,20 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
|||
return getPositiveValues(values).map(v -> (v - mean) * (v-mean)).sum() / count;
|
||||
}
|
||||
|
||||
private static LongStream filterAndFlatValues(long[][] values, Predicate<? super long[]> p) {
|
||||
return filterValues(values, (v) -> v != null).flatMapToLong(Arrays::stream);
|
||||
}
|
||||
|
||||
private static DoubleStream filterAndFlatValues(double[][] values, Predicate<? super double[]> p) {
|
||||
return filterValues(values, (v) -> v != null).flatMapToDouble(Arrays::stream);
|
||||
}
|
||||
|
||||
private static double computeVariance(LongStream values, double mean, int count) {
|
||||
return values.mapToDouble(v -> (v - mean) * (v-mean)).sum() / count;
|
||||
}
|
||||
|
||||
private static double computeVariance(DoubleStream values, double mean, int count) {
|
||||
return values.map(v -> (v - mean) * (v-mean)).sum() / count;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue