From 01de73bc0a1b315bbbe4df046b5c0661cec8de2e Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 4 Jul 2016 09:13:41 +0200 Subject: [PATCH] LUCENE-7351: Doc id compression for points. --- lucene/CHANGES.txt | 2 + .../org/apache/lucene/util/bkd/BKDReader.java | 18 +- .../org/apache/lucene/util/bkd/BKDWriter.java | 8 +- .../apache/lucene/util/bkd/DocIdsWriter.java | 170 ++++++++++++++++++ .../lucene/util/bkd/TestDocIdsWriter.java | 101 +++++++++++ 5 files changed, 288 insertions(+), 11 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 53e18a566c8..ce6e60b25aa 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -64,6 +64,8 @@ Optimizations * LUCENE-7356: SearchGroup tweaks. (Christine Poerschke) +* LUCENE-7351: Doc id compression for points. (Adrien Grand) + Other * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 9a123a2fbea..3566bc11d5b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -44,11 +44,12 @@ public class BKDReader implements Accountable { final byte[] maxPackedValue; final long pointCount; final int docCount; + final int version; protected final int packedBytesLength; /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */ public BKDReader(IndexInput in) throws IOException { - CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START); + version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT); numDims = in.readVInt(); maxPointsInLeafNode = in.readVInt(); bytesPerDim = in.readVInt(); @@ -141,6 +142,7 @@ public class BKDReader implements Accountable { this.maxPackedValue = maxPackedValue; this.pointCount = pointCount; this.docCount = docCount; + this.version = BKDWriter.VERSION_CURRENT; assert minPackedValue.length == packedBytesLength; assert maxPackedValue.length == packedBytesLength; } @@ -314,13 +316,15 @@ public class BKDReader implements Accountable { protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { // Leaf node in.seek(blockFP); - + // How many points are stored in this leaf cell: int count = in.readVInt(); visitor.grow(count); - for(int i=0;i 0: "maxPointsInLeafNode=" + maxPointsInLeafNode; out.writeVInt(count); - - for (int i=0;i docIds[start + i]) { + sorted = false; + break; + } + } + if (sorted) { + out.writeByte((byte) 0); + int previous = 0; + for (int i = 0; i < count; ++i) { + int doc = docIds[start + i]; + out.writeVInt(doc - previous); + previous = doc; + } + } else { + long max = 0; + for (int i = 0; i < count; ++i) { + max |= Integer.toUnsignedLong(docIds[start + i]); + } + if (max <= 0xffffff) { + out.writeByte((byte) 24); + for (int i = 0; i < count; ++i) { + out.writeShort((short) (docIds[start + i] >>> 8)); + out.writeByte((byte) docIds[start + i]); + } + } else { + out.writeByte((byte) 32); + for (int i = 0; i < count; ++i) { + out.writeInt(docIds[start + i]); + } + } + } + } + + /** Read {@code count} integers into {@code docIDs}. */ + static void readInts(IndexInput in, int count, int[] docIDs) throws IOException { + final int bpv = in.readByte(); + switch (bpv) { + case 0: + readDeltaVInts(in, count, docIDs); + break; + case 32: + readInts32(in, count, docIDs); + break; + case 24: + readInts24(in, count, docIDs); + break; + default: + throw new IOException("Unsupported number of bits per value: " + bpv); + } + } + + private static void readDeltaVInts(IndexInput in, int count, int[] docIDs) throws IOException { + int doc = 0; + for (int i = 0; i < count; i++) { + doc += in.readVInt(); + docIDs[i] = doc; + } + } + + static void readInts32(IndexInput in, int count, int[] docIDs) throws IOException { + for (int i = 0; i < count; i++) { + docIDs[i] = in.readInt(); + } + } + + private static void readInts24(IndexInput in, int count, int[] docIDs) throws IOException { + int i; + for (i = 0; i < count - 7; i += 8) { + long l1 = in.readLong(); + long l2 = in.readLong(); + long l3 = in.readLong(); + docIDs[i] = (int) (l1 >>> 40); + docIDs[i+1] = (int) (l1 >>> 16) & 0xffffff; + docIDs[i+2] = (int) (((l1 & 0xffff) << 8) | (l2 >>> 56)); + docIDs[i+3] = (int) (l2 >>> 32) & 0xffffff; + docIDs[i+4] = (int) (l2 >>> 8) & 0xffffff; + docIDs[i+5] = (int) (((l2 & 0xff) << 16) | (l3 >>> 48)); + docIDs[i+6] = (int) (l3 >>> 24) & 0xffffff; + docIDs[i+7] = (int) l3 & 0xffffff; + } + for (; i < count; ++i) { + docIDs[i] = (Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte()); + } + } + + /** Read {@code count} integers and feed the result directly to {@link IntersectVisitor#visit(int)}. */ + static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException { + final int bpv = in.readByte(); + switch (bpv) { + case 0: + readDeltaVInts(in, count, visitor); + break; + case 32: + readInts32(in, count, visitor); + break; + case 24: + readInts24(in, count, visitor); + break; + default: + throw new IOException("Unsupported number of bits per value: " + bpv); + } + } + + private static void readDeltaVInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException { + int doc = 0; + for (int i = 0; i < count; i++) { + doc += in.readVInt(); + visitor.visit(doc); + } + } + + static void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException { + for (int i = 0; i < count; i++) { + visitor.visit(in.readInt()); + } + } + + private static void readInts24(IndexInput in, int count, IntersectVisitor visitor) throws IOException { + int i; + for (i = 0; i < count - 7; i += 8) { + long l1 = in.readLong(); + long l2 = in.readLong(); + long l3 = in.readLong(); + visitor.visit((int) (l1 >>> 40)); + visitor.visit((int) (l1 >>> 16) & 0xffffff); + visitor.visit((int) (((l1 & 0xffff) << 8) | (l2 >>> 56))); + visitor.visit((int) (l2 >>> 32) & 0xffffff); + visitor.visit((int) (l2 >>> 8) & 0xffffff); + visitor.visit((int) (((l2 & 0xff) << 16) | (l3 >>> 48))); + visitor.visit((int) (l3 >>> 24) & 0xffffff); + visitor.visit((int) l3 & 0xffffff); + } + for (; i < count; ++i) { + visitor.visit((Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte())); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java new file mode 100644 index 00000000000..1191ba58c79 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.bkd; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.PointValues.IntersectVisitor; +import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestDocIdsWriter extends LuceneTestCase { + + public void testRandom() throws Exception { + try (Directory dir = newDirectory()) { + for (int iter = 0; iter < 1000; ++iter) { + int[] docIDs = new int[random().nextInt(5000)]; + final int bpv = TestUtil.nextInt(random(), 1, 32); + for (int i = 0; i < docIDs.length; ++i) { + docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1); + } + test(dir, docIDs); + } + } + } + + public void testSorted() throws Exception { + try (Directory dir = newDirectory()) { + for (int iter = 0; iter < 1000; ++iter) { + int[] docIDs = new int[random().nextInt(5000)]; + final int bpv = TestUtil.nextInt(random(), 1, 32); + for (int i = 0; i < docIDs.length; ++i) { + docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1); + } + Arrays.sort(docIDs); + test(dir, docIDs); + } + } + } + + private void test(Directory dir, int[] ints) throws Exception { + final long len; + try(IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) { + DocIdsWriter.writeDocIds(ints, 0, ints.length, out); + len = out.getFilePointer(); + if (random().nextBoolean()) { + out.writeLong(0); // garbage + } + } + try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) { + int[] read = new int[ints.length]; + DocIdsWriter.readInts(in, ints.length, read); + assertArrayEquals(ints, read); + assertEquals(len, in.getFilePointer()); + } + try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) { + int[] read = new int[ints.length]; + DocIdsWriter.readInts(in, ints.length, new IntersectVisitor() { + int i = 0; + @Override + public void visit(int docID) throws IOException { + read[i++] = docID; + } + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + throw new UnsupportedOperationException(); + } + + }); + assertArrayEquals(ints, read); + assertEquals(len, in.getFilePointer()); + } + dir.deleteFile("tmp"); + } + +}