mirror of https://github.com/apache/lucene.git
LUCENE-7590: add Sorted(Set)DocValuesStats
This commit is contained in:
parent
63a5cd0017
commit
47bb32c3bb
|
@ -21,7 +21,11 @@ import java.io.IOException;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.NumericDocValues;
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.index.SortedDocValues;
|
||||||
import org.apache.lucene.index.SortedNumericDocValues;
|
import org.apache.lucene.index.SortedNumericDocValues;
|
||||||
|
import org.apache.lucene.index.SortedSetDocValues;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/** Holds statistics for a DocValues field. */
|
/** Holds statistics for a DocValues field. */
|
||||||
public abstract class DocValuesStats<T> {
|
public abstract class DocValuesStats<T> {
|
||||||
|
@ -332,4 +336,88 @@ public abstract class DocValuesStats<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static BytesRef copyFrom(BytesRef src, BytesRef dest) {
|
||||||
|
if (dest == null) {
|
||||||
|
return BytesRef.deepCopyOf(src);
|
||||||
|
}
|
||||||
|
|
||||||
|
dest.bytes = ArrayUtil.grow(dest.bytes, src.length);
|
||||||
|
System.arraycopy(src.bytes, src.offset, dest.bytes, 0, src.length);
|
||||||
|
dest.offset = 0;
|
||||||
|
dest.length = src.length;
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Holds statistics for a sorted DocValues field. */
|
||||||
|
public static class SortedDocValuesStats extends DocValuesStats<BytesRef> {
|
||||||
|
|
||||||
|
protected SortedDocValues sdv;
|
||||||
|
protected Bits docsWithField;
|
||||||
|
|
||||||
|
protected SortedDocValuesStats(String field) {
|
||||||
|
super(field, null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final boolean init(LeafReaderContext context) throws IOException {
|
||||||
|
sdv = context.reader().getSortedDocValues(field);
|
||||||
|
docsWithField = context.reader().getDocsWithField(field);
|
||||||
|
return sdv != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final boolean hasValue(int doc) throws IOException {
|
||||||
|
return docsWithField.get(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doAccumulate(int doc, int count) throws IOException {
|
||||||
|
BytesRef val = sdv.get(doc);
|
||||||
|
if (max == null || val.compareTo(max) > 0) {
|
||||||
|
max = copyFrom(val, max);
|
||||||
|
}
|
||||||
|
if (min == null || val.compareTo(min) < 0) {
|
||||||
|
min = copyFrom(val, min);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Holds statistics for a sorted-set DocValues field. */
|
||||||
|
public static class SortedSetDocValuesStats extends DocValuesStats<BytesRef> {
|
||||||
|
|
||||||
|
protected SortedSetDocValues ssdv;
|
||||||
|
protected Bits docsWithField;
|
||||||
|
|
||||||
|
protected SortedSetDocValuesStats(String field) {
|
||||||
|
super(field, null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final boolean init(LeafReaderContext context) throws IOException {
|
||||||
|
ssdv = context.reader().getSortedSetDocValues(field);
|
||||||
|
docsWithField = context.reader().getDocsWithField(field);
|
||||||
|
return ssdv != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final boolean hasValue(int doc) throws IOException {
|
||||||
|
return docsWithField.get(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doAccumulate(int doc, int count) throws IOException {
|
||||||
|
ssdv.setDocument(doc);
|
||||||
|
long ord;
|
||||||
|
while ((ord = ssdv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
|
||||||
|
BytesRef val = ssdv.lookupOrd(ord);
|
||||||
|
if (max == null || val.compareTo(max) > 0) {
|
||||||
|
max = copyFrom(val, max);
|
||||||
|
}
|
||||||
|
if (min == null || val.compareTo(min) < 0) {
|
||||||
|
min = copyFrom(val, min);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.DoubleSummaryStatistics;
|
import java.util.DoubleSummaryStatistics;
|
||||||
import java.util.LongSummaryStatistics;
|
import java.util.LongSummaryStatistics;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.stream.DoubleStream;
|
import java.util.stream.DoubleStream;
|
||||||
import java.util.stream.LongStream;
|
import java.util.stream.LongStream;
|
||||||
|
@ -29,16 +30,21 @@ import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.DoubleDocValuesField;
|
import org.apache.lucene.document.DoubleDocValuesField;
|
||||||
import org.apache.lucene.document.Field.Store;
|
import org.apache.lucene.document.Field.Store;
|
||||||
import org.apache.lucene.document.NumericDocValuesField;
|
import org.apache.lucene.document.NumericDocValuesField;
|
||||||
|
import org.apache.lucene.document.SortedDocValuesField;
|
||||||
import org.apache.lucene.document.SortedNumericDocValuesField;
|
import org.apache.lucene.document.SortedNumericDocValuesField;
|
||||||
|
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
|
import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
|
||||||
import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
|
import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
|
||||||
|
import org.apache.lucene.search.DocValuesStats.SortedDocValuesStats;
|
||||||
import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats;
|
import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats;
|
||||||
import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats;
|
import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats;
|
||||||
|
import org.apache.lucene.search.DocValuesStats.SortedSetDocValuesStats;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
@ -227,8 +233,8 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field);
|
SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field);
|
||||||
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
|
assertEquals(nonNull(docValues).count(), stats.count());
|
||||||
assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
|
assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing());
|
||||||
if (stats.count() > 0) {
|
if (stats.count() > 0) {
|
||||||
LongSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
|
LongSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
|
||||||
assertEquals(sumStats.getMax(), stats.max().longValue());
|
assertEquals(sumStats.getMax(), stats.max().longValue());
|
||||||
|
@ -281,8 +287,8 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field);
|
SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field);
|
||||||
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
assertEquals(filterValues(docValues, (v) -> v != null).count(), stats.count());
|
assertEquals(nonNull(docValues).count(), stats.count());
|
||||||
assertEquals(filterValues(docValues, (v) -> v == null).count() - reader.numDeletedDocs(), stats.missing());
|
assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing());
|
||||||
if (stats.count() > 0) {
|
if (stats.count() > 0) {
|
||||||
DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
|
DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics();
|
||||||
assertEquals(sumStats.getMax(), stats.max().longValue(), 0.00001);
|
assertEquals(sumStats.getMax(), stats.max().longValue(), 0.00001);
|
||||||
|
@ -298,6 +304,96 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDocsWithSortedValues() throws IOException {
|
||||||
|
try (Directory dir = newDirectory();
|
||||||
|
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||||
|
String field = "sorted";
|
||||||
|
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||||
|
BytesRef[] docValues = new BytesRef[numDocs];
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
if (random().nextBoolean()) { // not all documents have a value
|
||||||
|
BytesRef val = TestUtil.randomBinaryTerm(random());
|
||||||
|
doc.add(new SortedDocValuesField(field, val));
|
||||||
|
doc.add(new StringField("id", "doc" + i, Store.NO));
|
||||||
|
docValues[i] = val;
|
||||||
|
}
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 20% of cases delete some docs
|
||||||
|
if (random().nextDouble() < 0.2) {
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
indexWriter.deleteDocuments(new Term("id", "doc" + i));
|
||||||
|
docValues[i] = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
SortedDocValuesStats stats = new SortedDocValuesStats(field);
|
||||||
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
|
int expCount = (int) nonNull(docValues).count();
|
||||||
|
assertEquals(expCount, stats.count());
|
||||||
|
assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing());
|
||||||
|
if (stats.count() > 0) {
|
||||||
|
assertEquals(nonNull(docValues).min(BytesRef::compareTo).get(), stats.min());
|
||||||
|
assertEquals(nonNull(docValues).max(BytesRef::compareTo).get(), stats.max());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDocsWithSortedSetValues() throws IOException {
|
||||||
|
try (Directory dir = newDirectory();
|
||||||
|
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||||
|
String field = "sorted";
|
||||||
|
int numDocs = TestUtil.nextInt(random(), 1, 100);
|
||||||
|
BytesRef[][] docValues = new BytesRef[numDocs][];
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
if (random().nextBoolean()) { // not all documents have a value
|
||||||
|
int numValues = TestUtil.nextInt(random(), 1, 5);
|
||||||
|
docValues[i] = new BytesRef[numValues];
|
||||||
|
for (int j = 0; j < numValues; j++) {
|
||||||
|
BytesRef val = TestUtil.randomBinaryTerm(random());
|
||||||
|
doc.add(new SortedSetDocValuesField(field, val));
|
||||||
|
docValues[i][j] = val;
|
||||||
|
}
|
||||||
|
doc.add(new StringField("id", "doc" + i, Store.NO));
|
||||||
|
}
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 20% of cases delete some docs
|
||||||
|
if (random().nextDouble() < 0.2) {
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
indexWriter.deleteDocuments(new Term("id", "doc" + i));
|
||||||
|
docValues[i] = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
SortedSetDocValuesStats stats = new SortedSetDocValuesStats(field);
|
||||||
|
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
|
||||||
|
|
||||||
|
int expCount = (int) nonNull(docValues).count();
|
||||||
|
assertEquals(expCount, stats.count());
|
||||||
|
assertEquals(isNull(docValues).count() - reader.numDeletedDocs(), stats.missing());
|
||||||
|
if (stats.count() > 0) {
|
||||||
|
assertEquals(nonNull(docValues).flatMap(Arrays::stream).min(BytesRef::compareTo).get(), stats.min());
|
||||||
|
assertEquals(nonNull(docValues).flatMap(Arrays::stream).max(BytesRef::compareTo).get(), stats.max());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static LongStream getPositiveValues(long[] values) {
|
private static LongStream getPositiveValues(long[] values) {
|
||||||
return Arrays.stream(values).filter(v -> v > 0);
|
return Arrays.stream(values).filter(v -> v > 0);
|
||||||
}
|
}
|
||||||
|
@ -314,14 +410,6 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
return Arrays.stream(values).filter(v -> v == 0);
|
return Arrays.stream(values).filter(v -> v == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Stream<long[]> filterValues(long[][] values, Predicate<? super long[]> p) {
|
|
||||||
return Arrays.stream(values).filter(p);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Stream<double[]> filterValues(double[][] values, Predicate<? super double[]> p) {
|
|
||||||
return Arrays.stream(values).filter(p);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static double computeVariance(long[] values, double mean, int count) {
|
private static double computeVariance(long[] values, double mean, int count) {
|
||||||
return getPositiveValues(values).mapToDouble(v -> (v - mean) * (v-mean)).sum() / count;
|
return getPositiveValues(values).mapToDouble(v -> (v - mean) * (v-mean)).sum() / count;
|
||||||
}
|
}
|
||||||
|
@ -331,11 +419,11 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static LongStream filterAndFlatValues(long[][] values, Predicate<? super long[]> p) {
|
private static LongStream filterAndFlatValues(long[][] values, Predicate<? super long[]> p) {
|
||||||
return filterValues(values, (v) -> v != null).flatMapToLong(Arrays::stream);
|
return nonNull(values).flatMapToLong(Arrays::stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DoubleStream filterAndFlatValues(double[][] values, Predicate<? super double[]> p) {
|
private static DoubleStream filterAndFlatValues(double[][] values, Predicate<? super double[]> p) {
|
||||||
return filterValues(values, (v) -> v != null).flatMapToDouble(Arrays::stream);
|
return nonNull(values).flatMapToDouble(Arrays::stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double computeVariance(LongStream values, double mean, int count) {
|
private static double computeVariance(LongStream values, double mean, int count) {
|
||||||
|
@ -346,4 +434,15 @@ public class TestDocValuesStatsCollector extends LuceneTestCase {
|
||||||
return values.map(v -> (v - mean) * (v-mean)).sum() / count;
|
return values.map(v -> (v - mean) * (v-mean)).sum() / count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static <T> Stream<T> nonNull(T[] values) {
|
||||||
|
return filterValues(values, Objects::nonNull);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T> Stream<T> isNull(T[] values) {
|
||||||
|
return filterValues(values, Objects::isNull);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T> Stream<T> filterValues(T[] values, Predicate<? super T> p) {
|
||||||
|
return Arrays.stream(values).filter(p);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue