diff --git a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java index 9dd97a60232..b6449cca1ad 100644 --- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java +++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java @@ -20,7 +20,11 @@ import java.io.IOException; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; /** Holds statistics for a DocValues field. */ public abstract class DocValuesStats { @@ -323,4 +327,83 @@ public abstract class DocValuesStats { } } + private static BytesRef copyFrom(BytesRef src, BytesRef dest) { + if (dest == null) { + return BytesRef.deepCopyOf(src); + } + + dest.bytes = ArrayUtil.grow(dest.bytes, src.length); + System.arraycopy(src.bytes, src.offset, dest.bytes, 0, src.length); + dest.offset = 0; + dest.length = src.length; + return dest; + } + + /** Holds statistics for a sorted DocValues field. */ + public static class SortedDocValuesStats extends DocValuesStats { + + protected SortedDocValues sdv; + + protected SortedDocValuesStats(String field) { + super(field, null, null); + } + + @Override + protected final boolean init(LeafReaderContext context) throws IOException { + sdv = context.reader().getSortedDocValues(field); + return sdv != null; + } + + @Override + protected final boolean hasValue(int doc) throws IOException { + return sdv.advanceExact(doc); + } + + @Override + protected void doAccumulate(int count) throws IOException { + BytesRef val = sdv.binaryValue(); + if (max == null || val.compareTo(max) > 0) { + max = copyFrom(val, max); + } + if (min == null || val.compareTo(min) < 0) { + min = copyFrom(val, min); + } + } + } + + /** Holds statistics for a sorted-set DocValues field. */ + public static class SortedSetDocValuesStats extends DocValuesStats { + + protected SortedSetDocValues ssdv; + + protected SortedSetDocValuesStats(String field) { + super(field, null, null); + } + + @Override + protected final boolean init(LeafReaderContext context) throws IOException { + ssdv = context.reader().getSortedSetDocValues(field); + return ssdv != null; + } + + @Override + protected final boolean hasValue(int doc) throws IOException { + return ssdv.advanceExact(doc); + } + + @Override + protected void doAccumulate(int count) throws IOException { + long ord; + while ((ord = ssdv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + BytesRef val = ssdv.lookupOrd(ord); + if (max == null || val.compareTo(max) > 0) { + max = copyFrom(val, max); + } + if (min == null || val.compareTo(min) < 0) { + min = copyFrom(val, min); + } + } + } + } + } \ No newline at end of file diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java index 5fa4b04a196..3471c031911 100644 --- a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java +++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.DoubleSummaryStatistics; import java.util.LongSummaryStatistics; +import java.util.Objects; import java.util.function.Predicate; import java.util.stream.DoubleStream; import java.util.stream.LongStream; @@ -29,16 +30,21 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.DoubleDocValuesField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats; import org.apache.lucene.search.DocValuesStats.LongDocValuesStats; +import org.apache.lucene.search.DocValuesStats.SortedDocValuesStats; import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats; import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats; +import org.apache.lucene.search.DocValuesStats.SortedSetDocValuesStats; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -227,8 +233,8 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field); searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); - assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count()); - assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing()); + assertEquals(nonNull(docValues).count(), stats.count()); + assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing()); if (stats.count() > 0) { LongSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); assertEquals(sumStats.getMax(), stats.max().longValue()); @@ -281,8 +287,8 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field); searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); - assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count()); - assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing()); + assertEquals(nonNull(docValues).count(), stats.count()); + assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing()); if (stats.count() > 0) { DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); assertEquals(sumStats.getMax(), stats.max().longValue(), 0.00001); @@ -298,6 +304,96 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { } } + public void testDocsWithSortedValues() throws IOException { + try (Directory dir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + String field = "sorted"; + int numDocs = TestUtil.nextInt(random(), 1, 100); + BytesRef[] docValues = new BytesRef[numDocs]; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { // not all documents have a value + BytesRef val = TestUtil.randomBinaryTerm(random()); + doc.add(new SortedDocValuesField(field, val)); + doc.add(new StringField("id", "doc" + i, Store.NO)); + docValues[i] = val; + } + indexWriter.addDocument(doc); + } + + // 20% of cases delete some docs + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = null; + } + } + } + + try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { + IndexSearcher searcher = new IndexSearcher(reader); + SortedDocValuesStats stats = new SortedDocValuesStats(field); + searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); + + int expCount = (int) nonNull(docValues).count(); + assertEquals(expCount, stats.count()); + assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing()); + if (stats.count() > 0) { + assertEquals(nonNull(docValues).min(BytesRef::compareTo).get(), stats.min()); + assertEquals(nonNull(docValues).max(BytesRef::compareTo).get(), stats.max()); + } + } + } + } + + public void testDocsWithSortedSetValues() throws IOException { + try (Directory dir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + String field = "sorted"; + int numDocs = TestUtil.nextInt(random(), 1, 100); + BytesRef[][] docValues = new BytesRef[numDocs][]; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { // not all documents have a value + int numValues = TestUtil.nextInt(random(), 1, 5); + docValues[i] = new BytesRef[numValues]; + for (int j = 0; j < numValues; j++) { + BytesRef val = TestUtil.randomBinaryTerm(random()); + doc.add(new SortedSetDocValuesField(field, val)); + docValues[i][j] = val; + } + doc.add(new StringField("id", "doc" + i, Store.NO)); + } + indexWriter.addDocument(doc); + } + + // 20% of cases delete some docs + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = null; + } + } + } + + try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { + IndexSearcher searcher = new IndexSearcher(reader); + SortedSetDocValuesStats stats = new SortedSetDocValuesStats(field); + searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); + + int expCount = (int) nonNull(docValues).count(); + assertEquals(expCount, stats.count()); + assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing()); + if (stats.count() > 0) { + assertEquals(nonNull(docValues).flatMap(Arrays::stream).min(BytesRef::compareTo).get(), stats.min()); + assertEquals(nonNull(docValues).flatMap(Arrays::stream).max(BytesRef::compareTo).get(), stats.max()); + } + } + } + } + private static LongStream getPositiveValues(long[] values) { return Arrays.stream(values).filter(v -> v > 0); } @@ -314,14 +410,6 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { return Arrays.stream(values).filter(v -> v == 0); } - private static Stream filterValues(long[][] values, Predicate p) { - return Arrays.stream(values).filter(p); - } - - private static Stream filterValues(double[][] values, Predicate p) { - return Arrays.stream(values).filter(p); - } - private static double computeVariance(long[] values, double mean, int count) { return getPositiveValues(values).mapToDouble(v -> (v - mean) * (v-mean)).sum() / count; } @@ -331,11 +419,11 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { } private static LongStream filterAndFlatValues(long[][] values, Predicate p) { - return filterValues(values, (v) -> v != null).flatMapToLong(Arrays::stream); + return nonNull(values).flatMapToLong(Arrays::stream); } private static DoubleStream filterAndFlatValues(double[][] values, Predicate p) { - return filterValues(values, (v) -> v != null).flatMapToDouble(Arrays::stream); + return nonNull(values).flatMapToDouble(Arrays::stream); } private static double computeVariance(LongStream values, double mean, int count) { @@ -346,4 +434,15 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { return values.map(v -> (v - mean) * (v-mean)).sum() / count; } + private static Stream nonNull(T[] values) { + return filterValues(values, Objects::nonNull); + } + + private static Stream isNull(T[] values) { + return filterValues(values, Objects::isNull); + } + + private static Stream filterValues(T[] values, Predicate p) { + return Arrays.stream(values).filter(p); + } }