mirror of
https://github.com/apache/lucene.git
synced 2025-02-09 03:25:15 +00:00
LUCENE-7351: Doc id compression for points.
This commit is contained in:
parent
4eaf66acb6
commit
01de73bc0a
@ -64,6 +64,8 @@ Optimizations
|
||||
|
||||
* LUCENE-7356: SearchGroup tweaks. (Christine Poerschke)
|
||||
|
||||
* LUCENE-7351: Doc id compression for points. (Adrien Grand)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
|
||||
|
@ -44,11 +44,12 @@ public class BKDReader implements Accountable {
|
||||
final byte[] maxPackedValue;
|
||||
final long pointCount;
|
||||
final int docCount;
|
||||
final int version;
|
||||
protected final int packedBytesLength;
|
||||
|
||||
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
|
||||
public BKDReader(IndexInput in) throws IOException {
|
||||
CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START);
|
||||
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
|
||||
numDims = in.readVInt();
|
||||
maxPointsInLeafNode = in.readVInt();
|
||||
bytesPerDim = in.readVInt();
|
||||
@ -141,6 +142,7 @@ public class BKDReader implements Accountable {
|
||||
this.maxPackedValue = maxPackedValue;
|
||||
this.pointCount = pointCount;
|
||||
this.docCount = docCount;
|
||||
this.version = BKDWriter.VERSION_CURRENT;
|
||||
assert minPackedValue.length == packedBytesLength;
|
||||
assert maxPackedValue.length == packedBytesLength;
|
||||
}
|
||||
@ -314,13 +316,15 @@ public class BKDReader implements Accountable {
|
||||
protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
|
||||
// Leaf node
|
||||
in.seek(blockFP);
|
||||
|
||||
|
||||
// How many points are stored in this leaf cell:
|
||||
int count = in.readVInt();
|
||||
visitor.grow(count);
|
||||
|
||||
for(int i=0;i<count;i++) {
|
||||
visitor.visit(in.readInt());
|
||||
if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
|
||||
DocIdsWriter.readInts32(in, count, visitor);
|
||||
} else {
|
||||
DocIdsWriter.readInts(in, count, visitor);
|
||||
}
|
||||
}
|
||||
|
||||
@ -330,8 +334,10 @@ public class BKDReader implements Accountable {
|
||||
// How many points are stored in this leaf cell:
|
||||
int count = in.readVInt();
|
||||
|
||||
for(int i=0;i<count;i++) {
|
||||
docIDs[i] = in.readInt();
|
||||
if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
|
||||
DocIdsWriter.readInts32(in, count, docIDs);
|
||||
} else {
|
||||
DocIdsWriter.readInts(in, count, docIDs);
|
||||
}
|
||||
|
||||
return count;
|
||||
|
@ -77,7 +77,8 @@ public class BKDWriter implements Closeable {
|
||||
|
||||
public static final String CODEC_NAME = "BKD";
|
||||
public static final int VERSION_START = 0;
|
||||
public static final int VERSION_CURRENT = VERSION_START;
|
||||
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
|
||||
public static final int VERSION_CURRENT = VERSION_COMPRESSED_DOC_IDS;
|
||||
|
||||
/** How many bytes each docs takes in the fixed-width offline format */
|
||||
private final int bytesPerDoc;
|
||||
@ -892,10 +893,7 @@ public class BKDWriter implements Closeable {
|
||||
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
|
||||
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
|
||||
out.writeVInt(count);
|
||||
|
||||
for (int i=0;i<count;i++) {
|
||||
out.writeInt(docIDs[start + i]);
|
||||
}
|
||||
DocIdsWriter.writeDocIds(docIDs, start, count, out);
|
||||
}
|
||||
|
||||
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException {
|
||||
|
@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.bkd;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
||||
class DocIdsWriter {
|
||||
|
||||
private DocIdsWriter() {}
|
||||
|
||||
static void writeDocIds(int[] docIds, int start, int count, IndexOutput out) throws IOException {
|
||||
// docs can be sorted either when all docs in a block have the same value
|
||||
// or when a segment is sorted
|
||||
boolean sorted = true;
|
||||
for (int i = 1; i < count; ++i) {
|
||||
if (docIds[start + i - 1] > docIds[start + i]) {
|
||||
sorted = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (sorted) {
|
||||
out.writeByte((byte) 0);
|
||||
int previous = 0;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int doc = docIds[start + i];
|
||||
out.writeVInt(doc - previous);
|
||||
previous = doc;
|
||||
}
|
||||
} else {
|
||||
long max = 0;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
max |= Integer.toUnsignedLong(docIds[start + i]);
|
||||
}
|
||||
if (max <= 0xffffff) {
|
||||
out.writeByte((byte) 24);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
out.writeShort((short) (docIds[start + i] >>> 8));
|
||||
out.writeByte((byte) docIds[start + i]);
|
||||
}
|
||||
} else {
|
||||
out.writeByte((byte) 32);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
out.writeInt(docIds[start + i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Read {@code count} integers into {@code docIDs}. */
|
||||
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||
final int bpv = in.readByte();
|
||||
switch (bpv) {
|
||||
case 0:
|
||||
readDeltaVInts(in, count, docIDs);
|
||||
break;
|
||||
case 32:
|
||||
readInts32(in, count, docIDs);
|
||||
break;
|
||||
case 24:
|
||||
readInts24(in, count, docIDs);
|
||||
break;
|
||||
default:
|
||||
throw new IOException("Unsupported number of bits per value: " + bpv);
|
||||
}
|
||||
}
|
||||
|
||||
private static void readDeltaVInts(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||
int doc = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
doc += in.readVInt();
|
||||
docIDs[i] = doc;
|
||||
}
|
||||
}
|
||||
|
||||
static <T> void readInts32(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||
for (int i = 0; i < count; i++) {
|
||||
docIDs[i] = in.readInt();
|
||||
}
|
||||
}
|
||||
|
||||
private static void readInts24(IndexInput in, int count, int[] docIDs) throws IOException {
|
||||
int i;
|
||||
for (i = 0; i < count - 7; i += 8) {
|
||||
long l1 = in.readLong();
|
||||
long l2 = in.readLong();
|
||||
long l3 = in.readLong();
|
||||
docIDs[i] = (int) (l1 >>> 40);
|
||||
docIDs[i+1] = (int) (l1 >>> 16) & 0xffffff;
|
||||
docIDs[i+2] = (int) (((l1 & 0xffff) << 8) | (l2 >>> 56));
|
||||
docIDs[i+3] = (int) (l2 >>> 32) & 0xffffff;
|
||||
docIDs[i+4] = (int) (l2 >>> 8) & 0xffffff;
|
||||
docIDs[i+5] = (int) (((l2 & 0xff) << 16) | (l3 >>> 48));
|
||||
docIDs[i+6] = (int) (l3 >>> 24) & 0xffffff;
|
||||
docIDs[i+7] = (int) l3 & 0xffffff;
|
||||
}
|
||||
for (; i < count; ++i) {
|
||||
docIDs[i] = (Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte());
|
||||
}
|
||||
}
|
||||
|
||||
/** Read {@code count} integers and feed the result directly to {@link IntersectVisitor#visit(int)}. */
|
||||
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
|
||||
final int bpv = in.readByte();
|
||||
switch (bpv) {
|
||||
case 0:
|
||||
readDeltaVInts(in, count, visitor);
|
||||
break;
|
||||
case 32:
|
||||
readInts32(in, count, visitor);
|
||||
break;
|
||||
case 24:
|
||||
readInts24(in, count, visitor);
|
||||
break;
|
||||
default:
|
||||
throw new IOException("Unsupported number of bits per value: " + bpv);
|
||||
}
|
||||
}
|
||||
|
||||
private static void readDeltaVInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
|
||||
int doc = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
doc += in.readVInt();
|
||||
visitor.visit(doc);
|
||||
}
|
||||
}
|
||||
|
||||
static void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
|
||||
for (int i = 0; i < count; i++) {
|
||||
visitor.visit(in.readInt());
|
||||
}
|
||||
}
|
||||
|
||||
private static void readInts24(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
|
||||
int i;
|
||||
for (i = 0; i < count - 7; i += 8) {
|
||||
long l1 = in.readLong();
|
||||
long l2 = in.readLong();
|
||||
long l3 = in.readLong();
|
||||
visitor.visit((int) (l1 >>> 40));
|
||||
visitor.visit((int) (l1 >>> 16) & 0xffffff);
|
||||
visitor.visit((int) (((l1 & 0xffff) << 8) | (l2 >>> 56)));
|
||||
visitor.visit((int) (l2 >>> 32) & 0xffffff);
|
||||
visitor.visit((int) (l2 >>> 8) & 0xffffff);
|
||||
visitor.visit((int) (((l2 & 0xff) << 16) | (l3 >>> 48)));
|
||||
visitor.visit((int) (l3 >>> 24) & 0xffffff);
|
||||
visitor.visit((int) l3 & 0xffffff);
|
||||
}
|
||||
for (; i < count; ++i) {
|
||||
visitor.visit((Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte()));
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.bkd;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||
import org.apache.lucene.index.PointValues.Relation;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestDocIdsWriter extends LuceneTestCase {
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
try (Directory dir = newDirectory()) {
|
||||
for (int iter = 0; iter < 1000; ++iter) {
|
||||
int[] docIDs = new int[random().nextInt(5000)];
|
||||
final int bpv = TestUtil.nextInt(random(), 1, 32);
|
||||
for (int i = 0; i < docIDs.length; ++i) {
|
||||
docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
|
||||
}
|
||||
test(dir, docIDs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testSorted() throws Exception {
|
||||
try (Directory dir = newDirectory()) {
|
||||
for (int iter = 0; iter < 1000; ++iter) {
|
||||
int[] docIDs = new int[random().nextInt(5000)];
|
||||
final int bpv = TestUtil.nextInt(random(), 1, 32);
|
||||
for (int i = 0; i < docIDs.length; ++i) {
|
||||
docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
|
||||
}
|
||||
Arrays.sort(docIDs);
|
||||
test(dir, docIDs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void test(Directory dir, int[] ints) throws Exception {
|
||||
final long len;
|
||||
try(IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {
|
||||
DocIdsWriter.writeDocIds(ints, 0, ints.length, out);
|
||||
len = out.getFilePointer();
|
||||
if (random().nextBoolean()) {
|
||||
out.writeLong(0); // garbage
|
||||
}
|
||||
}
|
||||
try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) {
|
||||
int[] read = new int[ints.length];
|
||||
DocIdsWriter.readInts(in, ints.length, read);
|
||||
assertArrayEquals(ints, read);
|
||||
assertEquals(len, in.getFilePointer());
|
||||
}
|
||||
try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) {
|
||||
int[] read = new int[ints.length];
|
||||
DocIdsWriter.readInts(in, ints.length, new IntersectVisitor() {
|
||||
int i = 0;
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {
|
||||
read[i++] = docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
});
|
||||
assertArrayEquals(ints, read);
|
||||
assertEquals(len, in.getFilePointer());
|
||||
}
|
||||
dir.deleteFile("tmp");
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user