mirror of https://github.com/apache/lucene.git
Align doc value skipper interval boundaries when an interval contains a constant value (#13597)
Keep adding documents to a skipper interval while it is dense and single-valued.
This commit is contained in:
parent cc3b412183
commit 7709f575ef
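
In short, the writer no longer cuts a skip-index interval as soon as it holds skipIndexIntervalSize documents; it keeps appending documents as long as the next document is single-valued, carries the same value the interval already holds, and leaves no docID gap. A minimal standalone sketch of that rule (the ConstantRunAccumulator class and its main() driver are illustrative only, not Lucene code; the real logic is SkipAccumulator.isDone in the diff below):

// Hypothetical illustration of the boundary rule introduced by this commit.
// Not the Lucene implementation; see Lucene90DocValuesConsumer.SkipAccumulator below.
class ConstantRunAccumulator {
  final int minDocID;
  long minValue = Long.MAX_VALUE;
  long maxValue = Long.MIN_VALUE;
  int docCount = 0;

  ConstantRunAccumulator(int firstDocID) {
    this.minDocID = firstDocID;
  }

  void accept(long value) {
    minValue = Math.min(minValue, value);
    maxValue = Math.max(maxValue, value);
    docCount++;
  }

  // Close the interval only once it is full AND the next document would break
  // the constant, dense, single-valued run.
  boolean isDone(int intervalSize, int nextValueCount, long nextValue, int nextDoc) {
    if (docCount < intervalSize) {
      return false;
    }
    return nextValueCount > 1 // next doc is multi-valued
        || minValue != maxValue // interval already holds more than one distinct value
        || minValue != nextValue // next doc changes the value
        || docCount != nextDoc - minDocID; // a docID gap would make the interval non-dense
  }

  public static void main(String[] args) {
    ConstantRunAccumulator acc = new ConstantRunAccumulator(0);
    for (int doc = 0; doc < 10_000; doc++) {
      // With an interval size of 4096, a constant, dense, single-valued run is never cut:
      if (acc.isDone(4096, 1, 42L, doc)) {
        throw new AssertionError("unexpected interval boundary at doc " + doc);
      }
      acc.accept(42L);
    }
    System.out.println("one interval covering " + acc.docCount + " docs");
  }
}

Run as-is, the 10,000 identical dense values never trigger a boundary, so a single accumulator summarizes the whole run, which is the alignment behavior the CHANGES entry below describes.
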
@@ -135,6 +135,9 @@ New Features

 * GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)

+* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
+  value. (Ignacio Vera)
+
 Improvements
 ---------------------

@@ -205,6 +205,20 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
       docCount = 0;
     }

+    boolean isDone(int skipIndexIntervalSize, int valueCount, long nextValue, int nextDoc) {
+      if (docCount < skipIndexIntervalSize) {
+        return false;
+      }
+      // Once we reach the interval size, we will keep accepting documents if
+      // - next doc value is not a multi-value
+      // - current accumulator only contains a single value and next value is the same value
+      // - the accumulator is dense and the next doc keeps the density (no gaps)
+      return valueCount > 1
+          || minValue != maxValue
+          || minValue != nextValue
+          || docCount != nextDoc - minDocID;
+    }
+
     void accumulate(long value) {
       minValue = Math.min(minValue, value);
       maxValue = Math.max(maxValue, value);

@@ -245,15 +259,9 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
     SkipAccumulator accumulator = null;
     final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * (SKIP_INDEX_MAX_LEVEL - 1));
     for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
-      if (accumulator == null) {
-        accumulator = new SkipAccumulator(doc);
-        accumulators.add(accumulator);
-      }
-      accumulator.nextDoc(doc);
-      for (int i = 0, end = values.docValueCount(); i < end; ++i) {
-        accumulator.accumulate(values.nextValue());
-      }
-      if (accumulator.docCount == skipIndexIntervalSize) {
+      final long firstValue = values.nextValue();
+      if (accumulator != null
+          && accumulator.isDone(skipIndexIntervalSize, values.docValueCount(), firstValue, doc)) {
         globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
         globalMinValue = Math.min(globalMinValue, accumulator.minValue);
         globalDocCount += accumulator.docCount;

@@ -264,15 +272,22 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
           accumulators.clear();
         }
       }
+      if (accumulator == null) {
+        accumulator = new SkipAccumulator(doc);
+        accumulators.add(accumulator);
+      }
+      accumulator.nextDoc(doc);
+      accumulator.accumulate(firstValue);
+      for (int i = 1, end = values.docValueCount(); i < end; ++i) {
+        accumulator.accumulate(values.nextValue());
+      }
     }

     if (accumulators.isEmpty() == false) {
-      if (accumulator != null) {
-        globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
-        globalMinValue = Math.min(globalMinValue, accumulator.minValue);
-        globalDocCount += accumulator.docCount;
-        maxDocId = accumulator.maxDocID;
-      }
+      globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
+      globalMinValue = Math.min(globalMinValue, accumulator.minValue);
+      globalDocCount += accumulator.docCount;
+      maxDocId = accumulator.maxDocID;
       writeLevels(accumulators);
     }
     meta.writeLong(start); // record the start in meta
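
The new tests below read the skip index back through the DocValuesSkipper API, walking the level-0 intervals with advance() plus the minDocID/maxDocID/minValue/maxValue/docCount accessors. A read-side sketch of that traversal pattern, assuming a LeafReader over a segment whose field was written with a skip index (the SkipperIntervalDump name and printed format are illustrative only):

import java.io.IOException;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSetIterator;

final class SkipperIntervalDump {
  // Walk the level-0 intervals of a field's doc-values skip index and print their boundaries.
  static void dump(LeafReader leafReader, String field) throws IOException {
    DocValuesSkipper skipper = leafReader.getDocValuesSkipper(field);
    if (skipper == null) {
      return; // the field has no skip index
    }
    skipper.advance(0);
    while (skipper.minDocID(0) != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.println(
          "docs [" + skipper.minDocID(0) + ", " + skipper.maxDocID(0) + "]"
              + " values [" + skipper.minValue(0) + ", " + skipper.maxValue(0) + "]"
              + " docCount=" + skipper.docCount(0));
      skipper.advance(skipper.maxDocID(0) + 1);
    }
  }
}
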
@@ -16,11 +16,24 @@
  */
 package org.apache.lucene.codecs.lucene90;

+import java.io.IOException;
+import java.util.Arrays;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.index.DocValuesSkipper;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase;
+import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.TestUtil;

-/** Tests Lucene90DocValuesFormat */
+/** Tests Lucene90DocValuesFormat with custom skipper interval size */
 public class TestLucene90DocValuesFormatVariableSkipInterval extends BaseDocValuesFormatTestCase {

   @Override

@@ -36,4 +49,159 @@ public class TestLucene90DocValuesFormatVariableSkipInterval extends BaseDocValu
             () -> new Lucene90DocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2)));
     assertTrue(ex.getMessage().contains("skipIndexIntervalSize must be > 1"));
   }
+
+  public void testSkipperAllEqualValue() throws IOException {
+    final IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
+    try (Directory directory = newDirectory();
+        RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config)) {
+      final int numDocs = atLeast(100);
+      for (int i = 0; i < numDocs; i++) {
+        final Document doc = new Document();
+        doc.add(NumericDocValuesField.indexedField("dv", 0L));
+        writer.addDocument(doc);
+      }
+      writer.forceMerge(1);
+      try (IndexReader reader = writer.getReader()) {
+        assertEquals(1, reader.leaves().size());
+        final DocValuesSkipper skipper = reader.leaves().get(0).reader().getDocValuesSkipper("dv");
+        assertNotNull(skipper);
+        skipper.advance(0);
+        assertEquals(0L, skipper.minValue(0));
+        assertEquals(0L, skipper.maxValue(0));
+        assertEquals(numDocs, skipper.docCount(0));
+        skipper.advance(skipper.maxDocID(0) + 1);
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, skipper.minDocID(0));
+      }
+    }
+  }
+
+  // break on different value
+  public void testSkipperFewValuesSorted() throws IOException {
+    final IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
+    boolean reverse = random().nextBoolean();
+    config.setIndexSort(new Sort(new SortField("dv", SortField.Type.LONG, reverse)));
+    try (Directory directory = newDirectory();
+        RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config)) {
+      final int intervals = random().nextInt(2, 10);
+      final int[] numDocs = new int[intervals];
+      for (int i = 0; i < intervals; i++) {
+        numDocs[i] = random().nextInt(10) + 16;
+        for (int j = 0; j < numDocs[i]; j++) {
+          final Document doc = new Document();
+          doc.add(NumericDocValuesField.indexedField("dv", i));
+          writer.addDocument(doc);
+        }
+      }
+      writer.forceMerge(1);
+      try (IndexReader reader = writer.getReader()) {
+        assertEquals(1, reader.leaves().size());
+        final DocValuesSkipper skipper = reader.leaves().get(0).reader().getDocValuesSkipper("dv");
+        assertNotNull(skipper);
+        assertEquals(Arrays.stream(numDocs).sum(), skipper.docCount());
+        skipper.advance(0);
+        if (reverse) {
+          for (int i = intervals - 1; i >= 0; i--) {
+            assertEquals(i, skipper.minValue(0));
+            assertEquals(i, skipper.maxValue(0));
+            assertEquals(numDocs[i], skipper.docCount(0));
+            skipper.advance(skipper.maxDocID(0) + 1);
+          }
+        } else {
+          for (int i = 0; i < intervals; i++) {
+            assertEquals(i, skipper.minValue(0));
+            assertEquals(i, skipper.maxValue(0));
+            assertEquals(numDocs[i], skipper.docCount(0));
+            skipper.advance(skipper.maxDocID(0) + 1);
+          }
+        }
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, skipper.minDocID(0));
+      }
+    }
+  }
+
+  // break on empty doc values
+  public void testSkipperAllEqualValueWithGaps() throws IOException {
+    final IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
+    config.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG, false)));
+    try (Directory directory = newDirectory();
+        RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config)) {
+      final int gaps = random().nextInt(2, 10);
+      final int[] numDocs = new int[gaps];
+      long totaldocs = 0;
+      for (int i = 0; i < gaps; i++) {
+        numDocs[i] = random().nextInt(10) + 16;
+        for (int j = 0; j < numDocs[i]; j++) {
+          final Document doc = new Document();
+          doc.add(new NumericDocValuesField("sort", totaldocs++));
+          doc.add(SortedNumericDocValuesField.indexedField("dv", 0L));
+          writer.addDocument(doc);
+        }
+        // add doc with empty "dv"
+        final Document doc = new Document();
+        doc.add(new NumericDocValuesField("sort", totaldocs++));
+        writer.addDocument(doc);
+      }
+      writer.forceMerge(1);
+      try (IndexReader reader = writer.getReader()) {
+        assertEquals(1, reader.leaves().size());
+        final DocValuesSkipper skipper = reader.leaves().get(0).reader().getDocValuesSkipper("dv");
+        assertNotNull(skipper);
+        assertEquals(Arrays.stream(numDocs).sum(), skipper.docCount());
+        skipper.advance(0);
+        for (int i = 0; i < gaps; i++) {
+          assertEquals(0L, skipper.minValue(0));
+          assertEquals(0L, skipper.maxValue(0));
+          assertEquals(numDocs[i], skipper.docCount(0));
+          skipper.advance(skipper.maxDocID(0) + 1);
+        }
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, skipper.minDocID(0));
+      }
+    }
+  }
+
+  // break on multi-values
+  public void testSkipperAllEqualValueWithMultiValues() throws IOException {
+    final IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
+    config.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG, false)));
+    try (Directory directory = newDirectory();
+        RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config)) {
+      final int gaps = random().nextInt(2, 10);
+      final int[] numDocs = new int[gaps];
+      long totaldocs = 0;
+      for (int i = 0; i < gaps; i++) {
+        int docs = random().nextInt(10) + 16;
+        numDocs[i] += docs;
+        for (int j = 0; j < docs; j++) {
+          final Document doc = new Document();
+          doc.add(new NumericDocValuesField("sort", totaldocs++));
+          doc.add(SortedNumericDocValuesField.indexedField("dv", 0L));
+          writer.addDocument(doc);
+        }
+        if (i != gaps - 1) {
+          // add doc with multi-values
+          final Document doc = new Document();
+          doc.add(new NumericDocValuesField("sort", totaldocs++));
+          doc.add(SortedNumericDocValuesField.indexedField("dv", 0L));
+          doc.add(SortedNumericDocValuesField.indexedField("dv", 0L));
+          writer.addDocument(doc);
+          numDocs[i + 1] = 1;
+        }
+      }
+      writer.forceMerge(1);
+      try (IndexReader reader = writer.getReader()) {
+        assertEquals(1, reader.leaves().size());
+        final DocValuesSkipper skipper = reader.leaves().get(0).reader().getDocValuesSkipper("dv");
+        assertNotNull(skipper);
+        assertEquals(Arrays.stream(numDocs).sum(), skipper.docCount());
+        skipper.advance(0);
+        for (int i = 0; i < gaps; i++) {
+          assertEquals(0L, skipper.minValue(0));
+          assertEquals(0L, skipper.maxValue(0));
+          assertEquals(numDocs[i], skipper.docCount(0));
+          skipper.advance(skipper.maxDocID(0) + 1);
+        }
+        assertEquals(DocIdSetIterator.NO_MORE_DOCS, skipper.minDocID(0));
+      }
+    }
+  }
 }