LUCENE-7351: Doc id compression for points.

This commit is contained in:
Adrien Grand 2016-07-04 09:13:41 +02:00
parent 4eaf66acb6
commit 01de73bc0a
5 changed files with 288 additions and 11 deletions

View File

@ -64,6 +64,8 @@ Optimizations
* LUCENE-7356: SearchGroup tweaks. (Christine Poerschke)
* LUCENE-7351: Doc id compression for points. (Adrien Grand)
Other
* LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien

View File

@ -44,11 +44,12 @@ public class BKDReader implements Accountable {
final byte[] maxPackedValue;
final long pointCount;
final int docCount;
final int version;
protected final int packedBytesLength;
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
public BKDReader(IndexInput in) throws IOException {
CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START);
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
numDims = in.readVInt();
maxPointsInLeafNode = in.readVInt();
bytesPerDim = in.readVInt();
@ -141,6 +142,7 @@ public class BKDReader implements Accountable {
this.maxPackedValue = maxPackedValue;
this.pointCount = pointCount;
this.docCount = docCount;
this.version = BKDWriter.VERSION_CURRENT;
assert minPackedValue.length == packedBytesLength;
assert maxPackedValue.length == packedBytesLength;
}
@ -314,13 +316,15 @@ public class BKDReader implements Accountable {
protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
// Leaf node
in.seek(blockFP);
// How many points are stored in this leaf cell:
int count = in.readVInt();
visitor.grow(count);
for(int i=0;i<count;i++) {
visitor.visit(in.readInt());
if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
DocIdsWriter.readInts32(in, count, visitor);
} else {
DocIdsWriter.readInts(in, count, visitor);
}
}
@ -330,8 +334,10 @@ public class BKDReader implements Accountable {
// How many points are stored in this leaf cell:
int count = in.readVInt();
for(int i=0;i<count;i++) {
docIDs[i] = in.readInt();
if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
DocIdsWriter.readInts32(in, count, docIDs);
} else {
DocIdsWriter.readInts(in, count, docIDs);
}
return count;

View File

@ -77,7 +77,8 @@ public class BKDWriter implements Closeable {
public static final String CODEC_NAME = "BKD";
public static final int VERSION_START = 0;
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
public static final int VERSION_CURRENT = VERSION_COMPRESSED_DOC_IDS;
/** How many bytes each docs takes in the fixed-width offline format */
private final int bytesPerDoc;
@ -892,10 +893,7 @@ public class BKDWriter implements Closeable {
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
out.writeVInt(count);
for (int i=0;i<count;i++) {
out.writeInt(docIDs[start + i]);
}
DocIdsWriter.writeDocIds(docIDs, start, count, out);
}
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException {

View File

@ -0,0 +1,170 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.bkd;
import java.io.IOException;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
class DocIdsWriter {
private DocIdsWriter() {}
static void writeDocIds(int[] docIds, int start, int count, IndexOutput out) throws IOException {
// docs can be sorted either when all docs in a block have the same value
// or when a segment is sorted
boolean sorted = true;
for (int i = 1; i < count; ++i) {
if (docIds[start + i - 1] > docIds[start + i]) {
sorted = false;
break;
}
}
if (sorted) {
out.writeByte((byte) 0);
int previous = 0;
for (int i = 0; i < count; ++i) {
int doc = docIds[start + i];
out.writeVInt(doc - previous);
previous = doc;
}
} else {
long max = 0;
for (int i = 0; i < count; ++i) {
max |= Integer.toUnsignedLong(docIds[start + i]);
}
if (max <= 0xffffff) {
out.writeByte((byte) 24);
for (int i = 0; i < count; ++i) {
out.writeShort((short) (docIds[start + i] >>> 8));
out.writeByte((byte) docIds[start + i]);
}
} else {
out.writeByte((byte) 32);
for (int i = 0; i < count; ++i) {
out.writeInt(docIds[start + i]);
}
}
}
}
/** Read {@code count} integers into {@code docIDs}. */
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
final int bpv = in.readByte();
switch (bpv) {
case 0:
readDeltaVInts(in, count, docIDs);
break;
case 32:
readInts32(in, count, docIDs);
break;
case 24:
readInts24(in, count, docIDs);
break;
default:
throw new IOException("Unsupported number of bits per value: " + bpv);
}
}
private static void readDeltaVInts(IndexInput in, int count, int[] docIDs) throws IOException {
int doc = 0;
for (int i = 0; i < count; i++) {
doc += in.readVInt();
docIDs[i] = doc;
}
}
static <T> void readInts32(IndexInput in, int count, int[] docIDs) throws IOException {
for (int i = 0; i < count; i++) {
docIDs[i] = in.readInt();
}
}
private static void readInts24(IndexInput in, int count, int[] docIDs) throws IOException {
int i;
for (i = 0; i < count - 7; i += 8) {
long l1 = in.readLong();
long l2 = in.readLong();
long l3 = in.readLong();
docIDs[i] = (int) (l1 >>> 40);
docIDs[i+1] = (int) (l1 >>> 16) & 0xffffff;
docIDs[i+2] = (int) (((l1 & 0xffff) << 8) | (l2 >>> 56));
docIDs[i+3] = (int) (l2 >>> 32) & 0xffffff;
docIDs[i+4] = (int) (l2 >>> 8) & 0xffffff;
docIDs[i+5] = (int) (((l2 & 0xff) << 16) | (l3 >>> 48));
docIDs[i+6] = (int) (l3 >>> 24) & 0xffffff;
docIDs[i+7] = (int) l3 & 0xffffff;
}
for (; i < count; ++i) {
docIDs[i] = (Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte());
}
}
/** Read {@code count} integers and feed the result directly to {@link IntersectVisitor#visit(int)}. */
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
final int bpv = in.readByte();
switch (bpv) {
case 0:
readDeltaVInts(in, count, visitor);
break;
case 32:
readInts32(in, count, visitor);
break;
case 24:
readInts24(in, count, visitor);
break;
default:
throw new IOException("Unsupported number of bits per value: " + bpv);
}
}
private static void readDeltaVInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
int doc = 0;
for (int i = 0; i < count; i++) {
doc += in.readVInt();
visitor.visit(doc);
}
}
static void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; i++) {
visitor.visit(in.readInt());
}
}
private static void readInts24(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
int i;
for (i = 0; i < count - 7; i += 8) {
long l1 = in.readLong();
long l2 = in.readLong();
long l3 = in.readLong();
visitor.visit((int) (l1 >>> 40));
visitor.visit((int) (l1 >>> 16) & 0xffffff);
visitor.visit((int) (((l1 & 0xffff) << 8) | (l2 >>> 56)));
visitor.visit((int) (l2 >>> 32) & 0xffffff);
visitor.visit((int) (l2 >>> 8) & 0xffffff);
visitor.visit((int) (((l2 & 0xff) << 16) | (l3 >>> 48)));
visitor.visit((int) (l3 >>> 24) & 0xffffff);
visitor.visit((int) l3 & 0xffffff);
}
for (; i < count; ++i) {
visitor.visit((Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte()));
}
}
}

View File

@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.bkd;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestDocIdsWriter extends LuceneTestCase {
public void testRandom() throws Exception {
try (Directory dir = newDirectory()) {
for (int iter = 0; iter < 1000; ++iter) {
int[] docIDs = new int[random().nextInt(5000)];
final int bpv = TestUtil.nextInt(random(), 1, 32);
for (int i = 0; i < docIDs.length; ++i) {
docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
}
test(dir, docIDs);
}
}
}
public void testSorted() throws Exception {
try (Directory dir = newDirectory()) {
for (int iter = 0; iter < 1000; ++iter) {
int[] docIDs = new int[random().nextInt(5000)];
final int bpv = TestUtil.nextInt(random(), 1, 32);
for (int i = 0; i < docIDs.length; ++i) {
docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
}
Arrays.sort(docIDs);
test(dir, docIDs);
}
}
}
private void test(Directory dir, int[] ints) throws Exception {
final long len;
try(IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {
DocIdsWriter.writeDocIds(ints, 0, ints.length, out);
len = out.getFilePointer();
if (random().nextBoolean()) {
out.writeLong(0); // garbage
}
}
try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) {
int[] read = new int[ints.length];
DocIdsWriter.readInts(in, ints.length, read);
assertArrayEquals(ints, read);
assertEquals(len, in.getFilePointer());
}
try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) {
int[] read = new int[ints.length];
DocIdsWriter.readInts(in, ints.length, new IntersectVisitor() {
int i = 0;
@Override
public void visit(int docID) throws IOException {
read[i++] = docID;
}
@Override
public void visit(int docID, byte[] packedValue) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
throw new UnsupportedOperationException();
}
});
assertArrayEquals(ints, read);
assertEquals(len, in.getFilePointer());
}
dir.deleteFile("tmp");
}
}