LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)

This commit is contained in:
gf2121 2021-12-07 14:26:03 +08:00 committed by iverase
parent 4d48dc87f7
commit 892e324d02
4 changed files with 58 additions and 8 deletions

View File

@ -52,6 +52,8 @@ Improvements
Optimizations Optimizations
--------------------- ---------------------
* LUCENE-10280: Optimize BKD leaves' doc IDs codec when they are continuous. (Guo Feng)
* LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields * LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields
or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand) or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand)

View File

@ -35,7 +35,7 @@ public class DocBaseBitSetIterator extends DocIdSetIterator {
throw new IllegalArgumentException("cost must be >= 0, got " + cost); throw new IllegalArgumentException("cost must be >= 0, got " + cost);
} }
if ((docBase & 63) != 0) { if ((docBase & 63) != 0) {
throw new IllegalArgumentException("docBase need to be a multiple of 64"); throw new IllegalArgumentException("docBase need to be a multiple of 64, got " + docBase);
} }
this.bits = bits; this.bits = bits;
this.length = bits.length() + docBase; this.length = bits.length() + docBase;

View File

@ -44,14 +44,23 @@ class DocIdsWriter {
} }
} }
if (strictlySorted && (docIds[start + count - 1] - docIds[start] + 1) <= (count << 4)) { int min2max = docIds[start + count - 1] - docIds[start] + 1;
// Only trigger this optimization when max - min + 1 <= 16 * count in order to avoid expanding if (strictlySorted) {
// too much storage. if (min2max == count) {
// continuous ids, typically happens when segment is sorted
out.writeByte((byte) -2);
out.writeVInt(docIds[start]);
return;
} else if (min2max <= (count << 4)) {
assert min2max > count : "min2max: " + min2max + ", count: " + count;
// Only trigger bitset optimization when max - min + 1 <= 16 * count in order to avoid
// expanding too much storage.
// A field with lower cardinality will have higher probability to trigger this optimization. // A field with lower cardinality will have higher probability to trigger this optimization.
out.writeByte((byte) -1); out.writeByte((byte) -1);
writeIdsAsBitSet(docIds, start, count, out); writeIdsAsBitSet(docIds, start, count, out);
return; return;
} }
}
if (sorted) { if (sorted) {
out.writeByte((byte) 0); out.writeByte((byte) 0);
int previous = 0; int previous = 0;
@ -139,6 +148,9 @@ class DocIdsWriter {
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException { static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
final int bpv = in.readByte(); final int bpv = in.readByte();
switch (bpv) { switch (bpv) {
case -2:
readContinuousIds(in, count, docIDs);
break;
case -1: case -1:
readBitSet(in, count, docIDs); readBitSet(in, count, docIDs);
break; break;
@ -165,6 +177,13 @@ class DocIdsWriter {
return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6); return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6);
} }
// Decodes the continuous-ids format (header byte -2, written above when min2max == count):
// only the first doc ID is stored as a vInt; the remaining count-1 IDs are implied to be
// strictly consecutive, typically when the segment is sorted.
private static void readContinuousIds(IndexInput in, int count, int[] docIDs) throws IOException {
int start = in.readVInt(); // first (smallest) doc ID of the run
for (int i = 0; i < count; i++) {
docIDs[i] = start + i; // IDs are consecutive: start, start+1, ..., start+count-1
}
}
private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException { private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException {
DocIdSetIterator iterator = readBitSetIterator(in, count); DocIdSetIterator iterator = readBitSetIterator(in, count);
int docId, pos = 0; int docId, pos = 0;
@ -215,6 +234,9 @@ class DocIdsWriter {
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException { static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
final int bpv = in.readByte(); final int bpv = in.readByte();
switch (bpv) { switch (bpv) {
case -2:
readContinuousIds(in, count, visitor);
break;
case -1: case -1:
readBitSet(in, count, visitor); readBitSet(in, count, visitor);
break; break;
@ -274,4 +296,15 @@ class DocIdsWriter {
DocIdSetIterator bitSetIterator = readBitSetIterator(in, count); DocIdSetIterator bitSetIterator = readBitSetIterator(in, count);
visitor.visit(bitSetIterator); visitor.visit(bitSetIterator);
} }
// Visitor-based variant of the continuous-ids decoder (header byte -2). Rather than
// materializing an int[] it hands the visitor a DocIdSetIterator backed by a FixedBitSet
// covering the consecutive range [start, start + count).
private static void readContinuousIds(IndexInput in, int count, IntersectVisitor visitor)
throws IOException {
int start = in.readVInt(); // first doc ID of the consecutive run
// DocBaseBitSetIterator requires docBase to be a multiple of 64 (it throws otherwise),
// so round start down to a 64-aligned offset and shift the set bits by the remainder.
int extra = start & 63; // start mod 64
int offset = start - extra; // start rounded down to a multiple of 64 -> docBase
int numBits = count + extra; // bitset length needed to reach start + count from offset
FixedBitSet bitSet = new FixedBitSet(numBits);
bitSet.set(extra, numBits); // set the contiguous range [extra, extra + count)
visitor.visit(new DocBaseBitSetIterator(bitSet, count, offset));
}
} }

View File

@ -76,6 +76,21 @@ public class TestDocIdsWriter extends LuceneTestCase {
} }
} }
// Exercises the new continuous-ids codec (header byte -2): each iteration writes a block of
// 1..5000 strictly consecutive doc IDs starting at a random base and round-trips it through
// DocIdsWriter via the shared test(dir, ints) helper.
public void testContinuousIds() throws Exception {
int numIters = atLeast(100);
try (Directory dir = newDirectory()) {
for (int iter = 0; iter < numIters; ++iter) {
int size = 1 + random().nextInt(5000); // block size, at least 1
int[] docIDs = new int[size];
int start = random().nextInt(1000000); // random base doc ID
for (int i = 0; i < docIDs.length; i++) {
docIDs[i] = start + i; // consecutive IDs -> must hit the -2 encoding path
}
test(dir, docIDs);
}
}
}
private void test(Directory dir, int[] ints) throws Exception { private void test(Directory dir, int[] ints) throws Exception {
final long len; final long len;
try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) { try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {