mirror of https://github.com/apache/lucene.git
LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)
This commit is contained in:
parent
4d48dc87f7
commit
892e324d02
|
@ -52,6 +52,8 @@ Improvements
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
* LUCENE-10280: Optimize BKD leaves' doc IDs codec when they are continuous. (Guo Feng)
|
||||||
|
|
||||||
* LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields
|
* LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields
|
||||||
or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand)
|
or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand)
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class DocBaseBitSetIterator extends DocIdSetIterator {
|
||||||
throw new IllegalArgumentException("cost must be >= 0, got " + cost);
|
throw new IllegalArgumentException("cost must be >= 0, got " + cost);
|
||||||
}
|
}
|
||||||
if ((docBase & 63) != 0) {
|
if ((docBase & 63) != 0) {
|
||||||
throw new IllegalArgumentException("docBase need to be a multiple of 64");
|
throw new IllegalArgumentException("docBase need to be a multiple of 64, got " + docBase);
|
||||||
}
|
}
|
||||||
this.bits = bits;
|
this.bits = bits;
|
||||||
this.length = bits.length() + docBase;
|
this.length = bits.length() + docBase;
|
||||||
|
|
|
@ -44,14 +44,23 @@ class DocIdsWriter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strictlySorted && (docIds[start + count - 1] - docIds[start] + 1) <= (count << 4)) {
|
int min2max = docIds[start + count - 1] - docIds[start] + 1;
|
||||||
// Only trigger this optimization when max - min + 1 <= 16 * count in order to avoid expanding
|
if (strictlySorted) {
|
||||||
// too much storage.
|
if (min2max == count) {
|
||||||
|
// continuous ids, typically happens when segment is sorted
|
||||||
|
out.writeByte((byte) -2);
|
||||||
|
out.writeVInt(docIds[start]);
|
||||||
|
return;
|
||||||
|
} else if (min2max <= (count << 4)) {
|
||||||
|
assert min2max > count : "min2max: " + min2max + ", count: " + count;
|
||||||
|
// Only trigger bitset optimization when max - min + 1 <= 16 * count in order to avoid
|
||||||
|
// expanding too much storage.
|
||||||
// A field with lower cardinality will have higher probability to trigger this optimization.
|
// A field with lower cardinality will have higher probability to trigger this optimization.
|
||||||
out.writeByte((byte) -1);
|
out.writeByte((byte) -1);
|
||||||
writeIdsAsBitSet(docIds, start, count, out);
|
writeIdsAsBitSet(docIds, start, count, out);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if (sorted) {
|
if (sorted) {
|
||||||
out.writeByte((byte) 0);
|
out.writeByte((byte) 0);
|
||||||
int previous = 0;
|
int previous = 0;
|
||||||
|
@ -139,6 +148,9 @@ class DocIdsWriter {
|
||||||
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
|
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||||
final int bpv = in.readByte();
|
final int bpv = in.readByte();
|
||||||
switch (bpv) {
|
switch (bpv) {
|
||||||
|
case -2:
|
||||||
|
readContinuousIds(in, count, docIDs);
|
||||||
|
break;
|
||||||
case -1:
|
case -1:
|
||||||
readBitSet(in, count, docIDs);
|
readBitSet(in, count, docIDs);
|
||||||
break;
|
break;
|
||||||
|
@ -165,6 +177,13 @@ class DocIdsWriter {
|
||||||
return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6);
|
return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void readContinuousIds(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||||
|
int start = in.readVInt();
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
docIDs[i] = start + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException {
|
private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||||
DocIdSetIterator iterator = readBitSetIterator(in, count);
|
DocIdSetIterator iterator = readBitSetIterator(in, count);
|
||||||
int docId, pos = 0;
|
int docId, pos = 0;
|
||||||
|
@ -215,6 +234,9 @@ class DocIdsWriter {
|
||||||
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
|
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
|
||||||
final int bpv = in.readByte();
|
final int bpv = in.readByte();
|
||||||
switch (bpv) {
|
switch (bpv) {
|
||||||
|
case -2:
|
||||||
|
readContinuousIds(in, count, visitor);
|
||||||
|
break;
|
||||||
case -1:
|
case -1:
|
||||||
readBitSet(in, count, visitor);
|
readBitSet(in, count, visitor);
|
||||||
break;
|
break;
|
||||||
|
@ -274,4 +296,15 @@ class DocIdsWriter {
|
||||||
DocIdSetIterator bitSetIterator = readBitSetIterator(in, count);
|
DocIdSetIterator bitSetIterator = readBitSetIterator(in, count);
|
||||||
visitor.visit(bitSetIterator);
|
visitor.visit(bitSetIterator);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void readContinuousIds(IndexInput in, int count, IntersectVisitor visitor)
|
||||||
|
throws IOException {
|
||||||
|
int start = in.readVInt();
|
||||||
|
int extra = start & 63;
|
||||||
|
int offset = start - extra;
|
||||||
|
int numBits = count + extra;
|
||||||
|
FixedBitSet bitSet = new FixedBitSet(numBits);
|
||||||
|
bitSet.set(extra, numBits);
|
||||||
|
visitor.visit(new DocBaseBitSetIterator(bitSet, count, offset));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,6 +76,21 @@ public class TestDocIdsWriter extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testContinuousIds() throws Exception {
|
||||||
|
int numIters = atLeast(100);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
for (int iter = 0; iter < numIters; ++iter) {
|
||||||
|
int size = 1 + random().nextInt(5000);
|
||||||
|
int[] docIDs = new int[size];
|
||||||
|
int start = random().nextInt(1000000);
|
||||||
|
for (int i = 0; i < docIDs.length; i++) {
|
||||||
|
docIDs[i] = start + i;
|
||||||
|
}
|
||||||
|
test(dir, docIDs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void test(Directory dir, int[] ints) throws Exception {
|
private void test(Directory dir, int[] ints) throws Exception {
|
||||||
final long len;
|
final long len;
|
||||||
try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {
|
try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {
|
||||||
|
|
Loading…
Reference in New Issue