Align doc value skipper interval boundaries when an interval contains a constant value (#13597)

Keep adding documents to a skipper interval while it is dense and single-valued.
This commit is contained in:
Ignacio Vera 2024-07-22 14:52:44 +02:00 committed by GitHub
parent cc3b412183
commit 7709f575ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 202 additions and 16 deletions

View File

@ -135,6 +135,9 @@ New Features
* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
value. (Ignacio Vera)
Improvements
---------------------

View File

@ -205,6 +205,20 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
docCount = 0;
}
/**
 * Tells whether this accumulator has to be closed before the next document is added to it.
 *
 * <p>An accumulator is never done while it holds fewer than {@code skipIndexIntervalSize}
 * documents. Once that size is reached it stays open only while all of the following hold:
 *
 * <ul>
 *   <li>the next document is not multi-valued,
 *   <li>the accumulator contains a single value and the next value is that same value,
 *   <li>the accumulator is dense and the next document keeps the density (no gaps).
 * </ul>
 *
 * @param skipIndexIntervalSize number of documents below which the accumulator is never done
 * @param valueCount number of values of the next document
 * @param nextValue value of the next document that is compared against the accumulated value
 * @param nextDoc doc ID of the next document
 * @return {@code true} if the accumulator must be closed, {@code false} if the next document
 *     may still be added to it
 */
boolean isDone(int skipIndexIntervalSize, int valueCount, long nextValue, int nextDoc) {
  if (docCount >= skipIndexIntervalSize) {
    final boolean nextIsSingleValued = valueCount <= 1;
    final boolean holdsOneValue = minValue == maxValue;
    final boolean nextValueMatches = minValue == nextValue;
    // density check: the distance from the first doc ID to the next one equals the doc count
    final boolean staysDense = docCount == nextDoc - minDocID;
    return !(nextIsSingleValued && holdsOneValue && nextValueMatches && staysDense);
  }
  // interval size not reached yet: keep accumulating
  return false;
}
void accumulate(long value) {
minValue = Math.min(minValue, value);
maxValue = Math.max(maxValue, value);
@ -245,15 +259,9 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
SkipAccumulator accumulator = null;
final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * (SKIP_INDEX_MAX_LEVEL - 1));
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
if (accumulator == null) {
accumulator = new SkipAccumulator(doc);
accumulators.add(accumulator);
}
accumulator.nextDoc(doc);
for (int i = 0, end = values.docValueCount(); i < end; ++i) {
accumulator.accumulate(values.nextValue());
}
if (accumulator.docCount == skipIndexIntervalSize) {
final long firstValue = values.nextValue();
if (accumulator != null
&& accumulator.isDone(skipIndexIntervalSize, values.docValueCount(), firstValue, doc)) {
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
globalDocCount += accumulator.docCount;
@ -264,15 +272,22 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
accumulators.clear();
}
}
if (accumulator == null) {
accumulator = new SkipAccumulator(doc);
accumulators.add(accumulator);
}
accumulator.nextDoc(doc);
accumulator.accumulate(firstValue);
for (int i = 1, end = values.docValueCount(); i < end; ++i) {
accumulator.accumulate(values.nextValue());
}
}
if (accumulators.isEmpty() == false) {
if (accumulator != null) {
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
globalDocCount += accumulator.docCount;
maxDocId = accumulator.maxDocID;
}
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
globalDocCount += accumulator.docCount;
maxDocId = accumulator.maxDocID;
writeLevels(accumulators);
}
meta.writeLong(start); // record the start in meta

View File

@ -16,11 +16,24 @@
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.TestUtil;
/** Tests Lucene90DocValuesFormat */
/** Tests Lucene90DocValuesFormat with custom skipper interval size */
public class TestLucene90DocValuesFormatVariableSkipInterval extends BaseDocValuesFormatTestCase {
@Override
@ -36,4 +49,159 @@ public class TestLucene90DocValuesFormatVariableSkipInterval extends BaseDocValu
() -> new Lucene90DocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2)));
assertTrue(ex.getMessage().contains("skipIndexIntervalSize must be > 1"));
}
// All documents carry the same value: after merging to a single segment, the level-0 skipper
// entry reached by advance(0) is expected to report min = max = 0 and to count every document,
// with nothing left after it.
public void testSkipperAllEqualValue() throws IOException {
  final IndexWriterConfig iwc = new IndexWriterConfig().setCodec(getCodec());
  try (Directory dir = newDirectory();
      RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, iwc)) {
    final int expectedDocs = atLeast(100);
    int added = 0;
    while (added < expectedDocs) {
      final Document document = new Document();
      document.add(NumericDocValuesField.indexedField("dv", 0L));
      indexWriter.addDocument(document);
      added++;
    }
    indexWriter.forceMerge(1);
    try (IndexReader indexReader = indexWriter.getReader()) {
      assertEquals(1, indexReader.leaves().size());
      final DocValuesSkipper dvSkipper =
          indexReader.leaves().get(0).reader().getDocValuesSkipper("dv");
      assertNotNull(dvSkipper);
      dvSkipper.advance(0);
      assertEquals(0L, dvSkipper.minValue(0));
      assertEquals(0L, dvSkipper.maxValue(0));
      assertEquals(expectedDocs, dvSkipper.docCount(0));
      // moving past the only interval must exhaust the skipper
      dvSkipper.advance(dvSkipper.maxDocID(0) + 1);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dvSkipper.minDocID(0));
    }
  }
}
// Break on different value: the index is sorted on "dv" (ascending or descending, picked at
// random), and every run of documents sharing a value is expected to be reported as its own
// level-0 skipper interval.
public void testSkipperFewValuesSorted() throws IOException {
  final IndexWriterConfig iwc = new IndexWriterConfig().setCodec(getCodec());
  final boolean descending = random().nextBoolean();
  iwc.setIndexSort(new Sort(new SortField("dv", SortField.Type.LONG, descending)));
  try (Directory dir = newDirectory();
      RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, iwc)) {
    final int runCount = random().nextInt(2, 10);
    final int[] runLengths = new int[runCount];
    for (int value = 0; value < runCount; value++) {
      runLengths[value] = random().nextInt(10) + 16;
      int remaining = runLengths[value];
      while (remaining > 0) {
        final Document document = new Document();
        document.add(NumericDocValuesField.indexedField("dv", value));
        indexWriter.addDocument(document);
        remaining--;
      }
    }
    indexWriter.forceMerge(1);
    try (IndexReader indexReader = indexWriter.getReader()) {
      assertEquals(1, indexReader.leaves().size());
      final DocValuesSkipper dvSkipper =
          indexReader.leaves().get(0).reader().getDocValuesSkipper("dv");
      assertNotNull(dvSkipper);
      assertEquals(Arrays.stream(runLengths).sum(), dvSkipper.docCount());
      dvSkipper.advance(0);
      for (int step = 0; step < runCount; step++) {
        // with a descending sort the runs are visited from the largest value to the smallest
        final int expectedValue = descending ? runCount - 1 - step : step;
        assertEquals(expectedValue, dvSkipper.minValue(0));
        assertEquals(expectedValue, dvSkipper.maxValue(0));
        assertEquals(runLengths[expectedValue], dvSkipper.docCount(0));
        dvSkipper.advance(dvSkipper.maxDocID(0) + 1);
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dvSkipper.minDocID(0));
    }
  }
}
// Break on empty doc values: runs of documents that all hold the value 0 in "dv" are separated
// by one document without a "dv" value. The index is sorted on "sort", which is assigned in
// insertion order, and each run is expected to be reported as its own level-0 skipper interval.
public void testSkipperAllEqualValueWithGaps() throws IOException {
  final IndexWriterConfig iwc = new IndexWriterConfig().setCodec(getCodec());
  iwc.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG, false)));
  try (Directory dir = newDirectory();
      RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, iwc)) {
    final int runCount = random().nextInt(2, 10);
    final int[] runLengths = new int[runCount];
    long sortKey = 0;
    for (int run = 0; run < runCount; run++) {
      runLengths[run] = random().nextInt(10) + 16;
      int remaining = runLengths[run];
      while (remaining > 0) {
        final Document withValue = new Document();
        withValue.add(new NumericDocValuesField("sort", sortKey++));
        withValue.add(SortedNumericDocValuesField.indexedField("dv", 0L));
        indexWriter.addDocument(withValue);
        remaining--;
      }
      // separator document: it has a sort key but no "dv" value
      final Document withoutValue = new Document();
      withoutValue.add(new NumericDocValuesField("sort", sortKey++));
      indexWriter.addDocument(withoutValue);
    }
    indexWriter.forceMerge(1);
    try (IndexReader indexReader = indexWriter.getReader()) {
      assertEquals(1, indexReader.leaves().size());
      final DocValuesSkipper dvSkipper =
          indexReader.leaves().get(0).reader().getDocValuesSkipper("dv");
      assertNotNull(dvSkipper);
      // only the documents that carry a "dv" value are counted
      assertEquals(Arrays.stream(runLengths).sum(), dvSkipper.docCount());
      dvSkipper.advance(0);
      for (int run = 0; run < runCount; run++) {
        assertEquals(0L, dvSkipper.minValue(0));
        assertEquals(0L, dvSkipper.maxValue(0));
        assertEquals(runLengths[run], dvSkipper.docCount(0));
        dvSkipper.advance(dvSkipper.maxDocID(0) + 1);
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dvSkipper.minDocID(0));
    }
  }
}
// Break on multi-values: groups of single-valued documents holding 0 in "dv" are separated by
// one document that holds the value 0 twice. The expected counts attribute that multi-valued
// document to the interval of the group that follows it.
public void testSkipperAllEqualValueWithMultiValues() throws IOException {
  final IndexWriterConfig iwc = new IndexWriterConfig().setCodec(getCodec());
  iwc.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG, false)));
  try (Directory dir = newDirectory();
      RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, iwc)) {
    final int groupCount = random().nextInt(2, 10);
    final int[] expectedCounts = new int[groupCount];
    long sortKey = 0;
    for (int group = 0; group < groupCount; group++) {
      final int singleValued = random().nextInt(10) + 16;
      // "+=" keeps the 1 that the previous iteration may have stored for the multi-valued doc
      expectedCounts[group] += singleValued;
      for (int added = 0; added < singleValued; added++) {
        final Document document = new Document();
        document.add(new NumericDocValuesField("sort", sortKey++));
        document.add(SortedNumericDocValuesField.indexedField("dv", 0L));
        indexWriter.addDocument(document);
      }
      final boolean lastGroup = group == groupCount - 1;
      if (lastGroup == false) {
        // add a doc with multi-values; it is counted as the first doc of the next group
        final Document multiValued = new Document();
        multiValued.add(new NumericDocValuesField("sort", sortKey++));
        multiValued.add(SortedNumericDocValuesField.indexedField("dv", 0L));
        multiValued.add(SortedNumericDocValuesField.indexedField("dv", 0L));
        indexWriter.addDocument(multiValued);
        expectedCounts[group + 1] = 1;
      }
    }
    indexWriter.forceMerge(1);
    try (IndexReader indexReader = indexWriter.getReader()) {
      assertEquals(1, indexReader.leaves().size());
      final DocValuesSkipper dvSkipper =
          indexReader.leaves().get(0).reader().getDocValuesSkipper("dv");
      assertNotNull(dvSkipper);
      assertEquals(Arrays.stream(expectedCounts).sum(), dvSkipper.docCount());
      dvSkipper.advance(0);
      for (int group = 0; group < groupCount; group++) {
        assertEquals(0L, dvSkipper.minValue(0));
        assertEquals(0L, dvSkipper.maxValue(0));
        assertEquals(expectedCounts[group], dvSkipper.docCount(0));
        dvSkipper.advance(dvSkipper.maxDocID(0) + 1);
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dvSkipper.minDocID(0));
    }
  }
}
}