mirror of https://github.com/apache/lucene.git
Add levels to DocValues skipper index (#13563)
Adding levels to be able to skip several intervals in one step.
This commit is contained in:
parent
c245ed2fb4
commit
9f991ed07e
|
@ -133,6 +133,8 @@ New Features
|
|||
DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether
|
||||
to create a "skip index" for doc values. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -19,9 +19,13 @@ package org.apache.lucene.codecs.lucene90;
|
|||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.NUMERIC_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.NUMERIC_BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_LEVEL_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_MAX_LEVEL;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
|
@ -43,7 +47,6 @@ import org.apache.lucene.search.SortedSetSelector;
|
|||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.ByteBuffersDataOutput;
|
||||
import org.apache.lucene.store.ByteBuffersIndexOutput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -207,65 +210,130 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
maxValue = Math.max(maxValue, value);
|
||||
}
|
||||
|
||||
void accumulate(SkipAccumulator other) {
|
||||
assert minDocID <= other.minDocID && maxDocID < other.maxDocID;
|
||||
maxDocID = other.maxDocID;
|
||||
minValue = Math.min(minValue, other.minValue);
|
||||
maxValue = Math.max(maxValue, other.maxValue);
|
||||
docCount += other.docCount;
|
||||
}
|
||||
|
||||
void nextDoc(int docID) {
|
||||
maxDocID = docID;
|
||||
++docCount;
|
||||
}
|
||||
|
||||
void writeTo(DataOutput output) throws IOException {
|
||||
output.writeInt(maxDocID);
|
||||
output.writeInt(minDocID);
|
||||
output.writeLong(maxValue);
|
||||
output.writeLong(minValue);
|
||||
output.writeInt(docCount);
|
||||
public static SkipAccumulator merge(List<SkipAccumulator> list, int index, int length) {
|
||||
SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
|
||||
for (int i = 0; i < length; i++) {
|
||||
acc.accumulate(list.get(index + i));
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
}
|
||||
|
||||
private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
|
||||
throws IOException {
|
||||
assert field.hasDocValuesSkipIndex();
|
||||
// TODO: This disk compression once we introduce levels
|
||||
long start = data.getFilePointer();
|
||||
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||
final long start = data.getFilePointer();
|
||||
final SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||
long globalMaxValue = Long.MIN_VALUE;
|
||||
long globalMinValue = Long.MAX_VALUE;
|
||||
int globalDocCount = 0;
|
||||
int maxDocId = -1;
|
||||
final List<SkipAccumulator> accumulators = new ArrayList<>();
|
||||
SkipAccumulator accumulator = null;
|
||||
int counter = 0;
|
||||
final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * (SKIP_INDEX_MAX_LEVEL - 1));
|
||||
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||
if (counter == 0) {
|
||||
if (accumulator == null) {
|
||||
accumulator = new SkipAccumulator(doc);
|
||||
accumulators.add(accumulator);
|
||||
}
|
||||
accumulator.nextDoc(doc);
|
||||
for (int i = 0, end = values.docValueCount(); i < end; ++i) {
|
||||
accumulator.accumulate(values.nextValue());
|
||||
}
|
||||
if (++counter == skipIndexIntervalSize) {
|
||||
if (accumulator.docCount == skipIndexIntervalSize) {
|
||||
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
|
||||
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
|
||||
globalDocCount += accumulator.docCount;
|
||||
maxDocId = accumulator.maxDocID;
|
||||
accumulator.writeTo(data);
|
||||
counter = 0;
|
||||
accumulator = null;
|
||||
if (accumulators.size() == maxAccumulators) {
|
||||
writeLevels(accumulators);
|
||||
accumulators.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (counter > 0) {
|
||||
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
|
||||
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
|
||||
globalDocCount += accumulator.docCount;
|
||||
maxDocId = accumulator.maxDocID;
|
||||
accumulator.writeTo(data);
|
||||
if (accumulators.isEmpty() == false) {
|
||||
if (accumulator != null) {
|
||||
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
|
||||
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
|
||||
globalDocCount += accumulator.docCount;
|
||||
maxDocId = accumulator.maxDocID;
|
||||
}
|
||||
writeLevels(accumulators);
|
||||
}
|
||||
meta.writeLong(start); // record the start in meta
|
||||
meta.writeLong(data.getFilePointer() - start); // record the length
|
||||
assert globalDocCount == 0 || globalMaxValue >= globalMinValue;
|
||||
meta.writeLong(globalMaxValue);
|
||||
meta.writeLong(globalMinValue);
|
||||
assert globalDocCount <= maxDocId + 1;
|
||||
meta.writeInt(globalDocCount);
|
||||
meta.writeInt(maxDocId);
|
||||
}
|
||||
|
||||
private void writeLevels(List<SkipAccumulator> accumulators) throws IOException {
|
||||
final List<List<SkipAccumulator>> accumulatorsLevels = new ArrayList<>(SKIP_INDEX_MAX_LEVEL);
|
||||
accumulatorsLevels.add(accumulators);
|
||||
for (int i = 0; i < SKIP_INDEX_MAX_LEVEL - 1; i++) {
|
||||
accumulatorsLevels.add(buildLevel(accumulatorsLevels.get(i)));
|
||||
}
|
||||
int totalAccumulators = accumulators.size();
|
||||
for (int index = 0; index < totalAccumulators; index++) {
|
||||
// compute how many levels we need to write for the current accumulator
|
||||
final int levels = getLevels(index, totalAccumulators);
|
||||
// write the number of levels
|
||||
data.writeByte((byte) levels);
|
||||
// write intervals in reverse order. This is done so we don't
|
||||
// need to read all of them in case of slipping
|
||||
for (int level = levels - 1; level >= 0; level--) {
|
||||
final SkipAccumulator accumulator =
|
||||
accumulatorsLevels.get(level).get(index >> (SKIP_INDEX_LEVEL_SHIFT * level));
|
||||
data.writeInt(accumulator.maxDocID);
|
||||
data.writeInt(accumulator.minDocID);
|
||||
data.writeLong(accumulator.maxValue);
|
||||
data.writeLong(accumulator.minValue);
|
||||
data.writeInt(accumulator.docCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static List<SkipAccumulator> buildLevel(List<SkipAccumulator> accumulators) {
|
||||
final int levelSize = 1 << SKIP_INDEX_LEVEL_SHIFT;
|
||||
final List<SkipAccumulator> collector = new ArrayList<>();
|
||||
for (int i = 0; i < accumulators.size() - levelSize + 1; i += levelSize) {
|
||||
collector.add(SkipAccumulator.merge(accumulators, i, levelSize));
|
||||
}
|
||||
return collector;
|
||||
}
|
||||
|
||||
private static int getLevels(int index, int size) {
|
||||
if (Integer.numberOfTrailingZeros(index) >= SKIP_INDEX_LEVEL_SHIFT) {
|
||||
// TODO: can we do it in constant time rather than linearly with SKIP_INDEX_MAX_LEVEL?
|
||||
final int left = size - index;
|
||||
for (int level = SKIP_INDEX_MAX_LEVEL - 1; level > 0; level--) {
|
||||
final int numberIntervals = 1 << (SKIP_INDEX_LEVEL_SHIFT * level);
|
||||
if (left >= numberIntervals && index % numberIntervals == 0) {
|
||||
return level + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, boolean ords)
|
||||
throws IOException {
|
||||
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||
|
|
|
@ -194,5 +194,36 @@ public final class Lucene90DocValuesFormat extends DocValuesFormat {
|
|||
static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
|
||||
static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
|
||||
|
||||
// number of documents in an interval
|
||||
private static final int DEFAULT_SKIP_INDEX_INTERVAL_SIZE = 4096;
|
||||
// bytes on an interval:
|
||||
// * 1 byte : number of levels
|
||||
// * 16 bytes: min / max value,
|
||||
// * 8 bytes: min / max docID
|
||||
// * 4 bytes: number of documents
|
||||
private static final long SKIP_INDEX_INTERVAL_BYTES = 29L;
|
||||
// number of intervals represented as a shift to create a new level, this is 1 << 3 == 8
|
||||
// intervals.
|
||||
static final int SKIP_INDEX_LEVEL_SHIFT = 3;
|
||||
// max number of levels
|
||||
// Increasing this number increases how much heap we need at index time.
|
||||
// we currently need (1 * 8 * 8 * 8) = 512 accumulators on heap
|
||||
static final int SKIP_INDEX_MAX_LEVEL = 4;
|
||||
// number of bytes to skip when skipping a level. It does not take into account the
|
||||
// current interval that is being read.
|
||||
static final long[] SKIP_INDEX_JUMP_LENGTH_PER_LEVEL = new long[SKIP_INDEX_MAX_LEVEL];
|
||||
|
||||
static {
|
||||
// Size of the interval minus read bytes (1 byte for level and 4 bytes for maxDocID)
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[0] = SKIP_INDEX_INTERVAL_BYTES - 5L;
|
||||
for (int level = 1; level < SKIP_INDEX_MAX_LEVEL; level++) {
|
||||
// jump from previous level
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] = SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level - 1];
|
||||
// nodes added by new level
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] +=
|
||||
(1 << (level * SKIP_INDEX_LEVEL_SHIFT)) * SKIP_INDEX_INTERVAL_BYTES;
|
||||
// remove the byte levels added in the previous level
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] -= (1 << ((level - 1) * SKIP_INDEX_LEVEL_SHIFT));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.lucene90;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_JUMP_LENGTH_PER_LEVEL;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_MAX_LEVEL;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -1792,28 +1794,55 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
if (input.length() > 0) {
|
||||
input.prefetch(0, 1);
|
||||
}
|
||||
// TODO: should we write to disk the actual max level for this segment?
|
||||
return new DocValuesSkipper() {
|
||||
int minDocID = -1;
|
||||
int maxDocID = -1;
|
||||
long minValue, maxValue;
|
||||
int docCount;
|
||||
final int[] minDocID = new int[SKIP_INDEX_MAX_LEVEL];
|
||||
final int[] maxDocID = new int[SKIP_INDEX_MAX_LEVEL];
|
||||
|
||||
{
|
||||
for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
|
||||
minDocID[i] = maxDocID[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
final long[] minValue = new long[SKIP_INDEX_MAX_LEVEL];
|
||||
final long[] maxValue = new long[SKIP_INDEX_MAX_LEVEL];
|
||||
final int[] docCount = new int[SKIP_INDEX_MAX_LEVEL];
|
||||
int levels = 1;
|
||||
|
||||
@Override
|
||||
public void advance(int target) throws IOException {
|
||||
if (target > entry.maxDocId) {
|
||||
minDocID = DocIdSetIterator.NO_MORE_DOCS;
|
||||
maxDocID = DocIdSetIterator.NO_MORE_DOCS;
|
||||
// skipper is exhausted
|
||||
for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
|
||||
minDocID[i] = maxDocID[i] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
}
|
||||
} else {
|
||||
// find next interval
|
||||
assert target > maxDocID[0] : "target must be bigger that current interval";
|
||||
while (true) {
|
||||
maxDocID = input.readInt();
|
||||
if (maxDocID >= target) {
|
||||
minDocID = input.readInt();
|
||||
maxValue = input.readLong();
|
||||
minValue = input.readLong();
|
||||
docCount = input.readInt();
|
||||
levels = input.readByte();
|
||||
assert levels <= SKIP_INDEX_MAX_LEVEL && levels > 0
|
||||
: "level out of range [" + levels + "]";
|
||||
boolean valid = true;
|
||||
// check if current interval is competitive or we can jump to the next position
|
||||
for (int level = levels - 1; level >= 0; level--) {
|
||||
if ((maxDocID[level] = input.readInt()) < target) {
|
||||
input.skipBytes(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level]); // the jump for the level
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
minDocID[level] = input.readInt();
|
||||
maxValue[level] = input.readLong();
|
||||
minValue[level] = input.readLong();
|
||||
docCount[level] = input.readInt();
|
||||
}
|
||||
if (valid) {
|
||||
// adjust levels
|
||||
while (levels < SKIP_INDEX_MAX_LEVEL && maxDocID[levels] >= target) {
|
||||
levels++;
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
input.skipBytes(24);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1821,32 +1850,32 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public int numLevels() {
|
||||
return 1;
|
||||
return levels;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int minDocID(int level) {
|
||||
return minDocID;
|
||||
return minDocID[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int maxDocID(int level) {
|
||||
return maxDocID;
|
||||
return maxDocID[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public long minValue(int level) {
|
||||
return minValue;
|
||||
return minValue[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public long maxValue(int level) {
|
||||
return maxValue;
|
||||
return maxValue[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docCount(int level) {
|
||||
return docCount;
|
||||
return docCount[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -3302,17 +3302,17 @@ public final class CheckIndex implements Closeable {
|
|||
if (skipper.maxDocID(0) == NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
if (skipper.minDocID(0) < doc) {
|
||||
throw new CheckIndexException(
|
||||
"skipper dv iterator for field: "
|
||||
+ fieldName
|
||||
+ " reports wrong minDocID, got "
|
||||
+ skipper.minDocID(0)
|
||||
+ " < "
|
||||
+ doc);
|
||||
}
|
||||
int levels = skipper.numLevels();
|
||||
for (int level = 0; level < levels; level++) {
|
||||
if (skipper.minDocID(level) < doc) {
|
||||
throw new CheckIndexException(
|
||||
"skipper dv iterator for field: "
|
||||
+ fieldName
|
||||
+ " reports wrong minDocID, got "
|
||||
+ skipper.minDocID(level)
|
||||
+ " < "
|
||||
+ doc);
|
||||
}
|
||||
if (skipper.minDocID(level) > skipper.maxDocID(level)) {
|
||||
throw new CheckIndexException(
|
||||
"skipper dv iterator for field: "
|
||||
|
|
|
@ -25,7 +25,8 @@ public class TestLucene90DocValuesFormatVariableSkipInterval extends BaseDocValu
|
|||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(random().nextInt(2, 1024)));
|
||||
// small interval size to test with many intervals
|
||||
return TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(random().nextInt(4, 16)));
|
||||
}
|
||||
|
||||
public void testSkipIndexIntervalSize() {
|
||||
|
|
|
@ -773,6 +773,13 @@ public abstract class BaseDocValuesFormatTestCase extends LegacyBaseDocValuesFor
|
|||
maxVal <= skipper.maxValue(0));
|
||||
}
|
||||
docCount += skipper.docCount(0);
|
||||
for (int level = 1; level < skipper.numLevels(); level++) {
|
||||
assertTrue(skipper.minDocID(0) >= skipper.minDocID(level));
|
||||
assertTrue(skipper.maxDocID(0) <= skipper.maxDocID(level));
|
||||
assertTrue(skipper.minValue(0) >= skipper.minValue(level));
|
||||
assertTrue(skipper.maxValue(0) <= skipper.maxValue(level));
|
||||
assertTrue(skipper.docCount(0) < skipper.docCount(level));
|
||||
}
|
||||
}
|
||||
|
||||
assertEquals(docCount, skipper.docCount());
|
||||
|
@ -784,19 +791,23 @@ public abstract class BaseDocValuesFormatTestCase extends LegacyBaseDocValuesFor
|
|||
if (skipper == null) {
|
||||
return;
|
||||
}
|
||||
int nextLevel = 0;
|
||||
while (true) {
|
||||
int doc = random().nextInt(skipper.maxDocID(0), maxDoc + 1) + 1;
|
||||
int doc = random().nextInt(skipper.maxDocID(nextLevel), maxDoc + 1) + 1;
|
||||
skipper.advance(doc);
|
||||
if (skipper.minDocID(0) == NO_MORE_DOCS) {
|
||||
assertEquals(NO_MORE_DOCS, skipper.maxDocID(0));
|
||||
return;
|
||||
}
|
||||
if (iterator.advanceExact(doc)) {
|
||||
assertTrue(iterator.docID() >= skipper.minDocID(0));
|
||||
assertTrue(iterator.docID() <= skipper.maxDocID(0));
|
||||
assertTrue(iterator.minValue() >= skipper.minValue(0));
|
||||
assertTrue(iterator.maxValue() <= skipper.maxValue(0));
|
||||
for (int level = 0; level < skipper.numLevels(); level++) {
|
||||
assertTrue(iterator.docID() >= skipper.minDocID(level));
|
||||
assertTrue(iterator.docID() <= skipper.maxDocID(level));
|
||||
assertTrue(iterator.minValue() >= skipper.minValue(level));
|
||||
assertTrue(iterator.maxValue() <= skipper.maxValue(level));
|
||||
}
|
||||
}
|
||||
nextLevel = random().nextInt(skipper.numLevels());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue