LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)

This commit is contained in:
gf2121 2021-12-07 14:26:03 +08:00 committed by iverase
parent 4d48dc87f7
commit 892e324d02
4 changed files with 58 additions and 8 deletions

View File

@ -52,6 +52,8 @@ Improvements
Optimizations
---------------------
* LUCENE-10280: Optimize BKD leaves' doc IDs codec when they are continuous. (Guo Feng)
* LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields
or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand)

View File

@ -35,7 +35,7 @@ public class DocBaseBitSetIterator extends DocIdSetIterator {
throw new IllegalArgumentException("cost must be >= 0, got " + cost);
}
if ((docBase & 63) != 0) {
throw new IllegalArgumentException("docBase need to be a multiple of 64");
throw new IllegalArgumentException("docBase need to be a multiple of 64, got " + docBase);
}
this.bits = bits;
this.length = bits.length() + docBase;

View File

@ -44,13 +44,22 @@ class DocIdsWriter {
}
}
if (strictlySorted && (docIds[start + count - 1] - docIds[start] + 1) <= (count << 4)) {
// Only trigger this optimization when max - min + 1 <= 16 * count in order to avoid expanding
// too much storage.
// A field with lower cardinality will have higher probability to trigger this optimization.
out.writeByte((byte) -1);
writeIdsAsBitSet(docIds, start, count, out);
return;
int min2max = docIds[start + count - 1] - docIds[start] + 1;
if (strictlySorted) {
if (min2max == count) {
// continuous ids, typically happens when segment is sorted
out.writeByte((byte) -2);
out.writeVInt(docIds[start]);
return;
} else if (min2max <= (count << 4)) {
assert min2max > count : "min2max: " + min2max + ", count: " + count;
// Only trigger bitset optimization when max - min + 1 <= 16 * count in order to avoid
// expanding too much storage.
// A field with lower cardinality will have higher probability to trigger this optimization.
out.writeByte((byte) -1);
writeIdsAsBitSet(docIds, start, count, out);
return;
}
}
if (sorted) {
out.writeByte((byte) 0);
@ -139,6 +148,9 @@ class DocIdsWriter {
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
final int bpv = in.readByte();
switch (bpv) {
case -2:
readContinuousIds(in, count, docIDs);
break;
case -1:
readBitSet(in, count, docIDs);
break;
@ -165,6 +177,13 @@ class DocIdsWriter {
return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6);
}
private static void readContinuousIds(IndexInput in, int count, int[] docIDs) throws IOException {
int start = in.readVInt();
for (int i = 0; i < count; i++) {
docIDs[i] = start + i;
}
}
private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException {
DocIdSetIterator iterator = readBitSetIterator(in, count);
int docId, pos = 0;
@ -215,6 +234,9 @@ class DocIdsWriter {
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
final int bpv = in.readByte();
switch (bpv) {
case -2:
readContinuousIds(in, count, visitor);
break;
case -1:
readBitSet(in, count, visitor);
break;
@ -274,4 +296,15 @@ class DocIdsWriter {
DocIdSetIterator bitSetIterator = readBitSetIterator(in, count);
visitor.visit(bitSetIterator);
}
private static void readContinuousIds(IndexInput in, int count, IntersectVisitor visitor)
throws IOException {
int start = in.readVInt();
int extra = start & 63;
int offset = start - extra;
int numBits = count + extra;
FixedBitSet bitSet = new FixedBitSet(numBits);
bitSet.set(extra, numBits);
visitor.visit(new DocBaseBitSetIterator(bitSet, count, offset));
}
}

View File

@ -76,6 +76,21 @@ public class TestDocIdsWriter extends LuceneTestCase {
}
}
public void testContinuousIds() throws Exception {
int numIters = atLeast(100);
try (Directory dir = newDirectory()) {
for (int iter = 0; iter < numIters; ++iter) {
int size = 1 + random().nextInt(5000);
int[] docIDs = new int[size];
int start = random().nextInt(1000000);
for (int i = 0; i < docIDs.length; i++) {
docIDs[i] = start + i;
}
test(dir, docIDs);
}
}
}
private void test(Directory dir, int[] ints) throws Exception {
final long len;
try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {