From f45358ebc198691acd66cbe269380a1a1a544f08 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 10 Jul 2013 19:24:24 +0000 Subject: [PATCH] LUCENE-5081: WAH8DocIdSet. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1501925 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../CompressingStoredFieldsIndexWriter.java | 1 - .../CompressingStoredFieldsWriter.java | 1 + .../CompressingTermVectorsWriter.java | 1 + .../lucene/codecs/lucene40/BitVector.java | 30 +- .../java/org/apache/lucene/util/BitUtil.java | 85 +++ .../GrowableByteArrayDataOutput.java | 21 +- .../lucene/util/OpenBitSetIterator.java | 51 +- .../org/apache/lucene/util/WAH8DocIdSet.java | 625 ++++++++++++++++++ .../apache/lucene/util/TestWAH8DocIdSet.java | 144 ++++ 10 files changed, 878 insertions(+), 84 deletions(-) rename lucene/core/src/java/org/apache/lucene/{codecs/compressing => util}/GrowableByteArrayDataOutput.java (75%) create mode 100644 lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 39ae48e02ad..6a2ce7ccdce 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -51,6 +51,9 @@ New features * LUCENE-5084: Added new Elias-Fano encoder, decoder and DocIdSet implementations. (Paul Elschot via Adrien Grand) +* LUCENE-5081: Added WAH8DocIdSet, an in-memory doc id set implementation based + on word-aligned hybrid encoding. (Adrien Grand) + ======================= Lucene 4.4.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java index 113563f0726..2367d9e9c83 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java @@ -21,7 +21,6 @@ import java.io.Closeable; import java.io.IOException; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.packed.PackedInts; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index 865267a5b34..296283a665e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -44,6 +44,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.GrowableByteArrayDataOutput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java index 56d75192fb4..1fcd7a6b9dd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java @@ -45,6 +45,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.GrowableByteArrayDataOutput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.packed.BlockPackedWriter; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java index cd5f174d905..2de7a2be3ad 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java @@ -26,6 +26,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.MutableBits; @@ -166,7 +167,7 @@ final class BitVector implements Cloneable, MutableBits { int c = 0; int end = bits.length; for (int i = 0; i < end; i++) { - c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte + c += BitUtil.bitCount(bits[i]); // sum bits per byte } count = c; } @@ -179,29 +180,12 @@ final class BitVector implements Cloneable, MutableBits { int c = 0; int end = bits.length; for (int i = 0; i < end; i++) { - c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte + c += BitUtil.bitCount(bits[i]); // sum bits per byte } return c; } - private static final byte[] BYTE_COUNTS = { // table of bits/byte - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 - }; + private static String CODEC = "BitVector"; @@ -294,7 +278,7 @@ final class BitVector implements Cloneable, MutableBits { output.writeVInt(i-last); output.writeByte(bits[i]); last = i; - numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]); + numCleared -= (8-BitUtil.bitCount(bits[i])); assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7))); } } @@ -399,7 +383,7 @@ final class BitVector implements Cloneable, MutableBits { while (n>0) { last += input.readVInt(); bits[last] = input.readByte(); - n -= BYTE_COUNTS[bits[last] & 0xFF]; + n -= BitUtil.bitCount(bits[last]); assert n >= 0; } } @@ -416,7 +400,7 @@ final class BitVector implements Cloneable, MutableBits { while (numCleared>0) { last += input.readVInt(); bits[last] = input.readByte(); - numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF]; + numCleared -= 8-BitUtil.bitCount(bits[last]); assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7))); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/BitUtil.java b/lucene/core/src/java/org/apache/lucene/util/BitUtil.java index e606f39f719..3cb8bf4285a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitUtil.java @@ -22,8 +22,93 @@ package org.apache.lucene.util; // from org.apache.solr.util rev 555343 */ public final class BitUtil { + private static final byte[] BYTE_COUNTS = { // table of bits/byte + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 + }; + + // The General Idea: instead of having an array per byte that has + // the offsets of the next set bit, that array could be + // packed inside a 32 bit integer (8 4 bit numbers). That + // should be faster than accessing an array for each index, and + // the total array size is kept smaller (256*sizeof(int))=1K + /***** the python code that generated bitlist + def bits2int(val): + arr=0 + for shift in range(8,0,-1): + if val & 0x80: + arr = (arr << 4) | shift + val = val << 1 + return arr + + def int_table(): + tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ] + return ','.join(tbl) + ******/ + private static final int[] BIT_LISTS = { + 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43, + 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321, + 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62, + 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643, + 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532, + 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321, + 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742, + 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753, + 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431, + 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632, + 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321, + 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654, + 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8, + 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421, + 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531, + 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432, + 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864, + 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651, + 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541, + 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871, + 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742, + 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521, + 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421, + 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621, + 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421, + 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521, + 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542, + 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321 + }; + private BitUtil() {} // no instance + /** Return the number of bits sets in b. */ + public static int bitCount(byte b) { + return BYTE_COUNTS[b & 0xFF]; + } + + /** Return the list of bits which are set in b encoded as followed: + * (i >>> (4 * n)) & 0x0F is the offset of the n-th set bit of + * the given byte plus one, or 0 if there are n or less bits set in the given + * byte. For example bitList(12) returns 0x43:*/ + public static int bitList(byte b) { + return BIT_LISTS[b & 0xFF]; + } + // The pop methods used to rely on bit-manipulation tricks for speed but it // turns out that it is faster to use the Long.bitCount method (which is an // intrinsic since Java 6u18) in a naive loop, see LUCENE-2221 diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java similarity index 75% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java rename to lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java index f5aeefb3de3..18583ba5196 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,26 +17,27 @@ package org.apache.lucene.codecs.compressing; * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.store.DataOutput; -import org.apache.lucene.util.ArrayUtil; /** * A {@link DataOutput} that can be used to build a byte[]. + * @lucene.internal */ -final class GrowableByteArrayDataOutput extends DataOutput { +public final class GrowableByteArrayDataOutput extends DataOutput { - byte[] bytes; - int length; + /** The bytes */ + public byte[] bytes; + /** The length */ + public int length; - GrowableByteArrayDataOutput(int cp) { + /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */ + public GrowableByteArrayDataOutput(int cp) { this.bytes = new byte[ArrayUtil.oversize(cp, 1)]; this.length = 0; } @Override - public void writeByte(byte b) throws IOException { + public void writeByte(byte b) { if (length >= bytes.length) { bytes = ArrayUtil.grow(bytes); } @@ -44,7 +45,7 @@ final class GrowableByteArrayDataOutput extends DataOutput { } @Override - public void writeBytes(byte[] b, int off, int len) throws IOException { + public void writeBytes(byte[] b, int off, int len) { final int newLength = length + len; if (newLength > bytes.length) { bytes = ArrayUtil.grow(bytes, newLength); diff --git a/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java index ceadce3111c..699e7315a49 100644 --- a/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java @@ -25,55 +25,6 @@ import org.apache.lucene.search.DocIdSetIterator; */ public class OpenBitSetIterator extends DocIdSetIterator { - // The General Idea: instead of having an array per byte that has - // the offsets of the next set bit, that array could be - // packed inside a 32 bit integer (8 4 bit numbers). That - // should be faster than accessing an array for each index, and - // the total array size is kept smaller (256*sizeof(int))=1K - protected final static int[] bitlist={ - 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43, - 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321, - 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62, - 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643, - 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532, - 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321, - 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742, - 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753, - 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431, - 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632, - 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321, - 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654, - 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8, - 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421, - 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531, - 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432, - 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864, - 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651, - 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541, - 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871, - 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742, - 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521, - 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421, - 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621, - 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421, - 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521, - 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542, - 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321 - }; - /***** the python code that generated bitlist - def bits2int(val): - arr=0 - for shift in range(8,0,-1): - if val & 0x80: - arr = (arr << 4) | shift - val = val << 1 - return arr - - def int_table(): - tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ] - return ','.join(tbl) - ******/ - // hmmm, what about an iterator that finds zeros though, // or a reverse iterator... should they be separate classes // for efficiency, or have a common root interface? (or @@ -101,7 +52,7 @@ public class OpenBitSetIterator extends DocIdSetIterator { if ((int)word ==0) {wordShift +=32; word = word >>>32; } if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; } if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; } - indexArray = bitlist[(int)word & 0xff]; + indexArray = BitUtil.bitList((byte) word); } /***** alternate shift implementations diff --git a/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java new file mode 100644 index 00000000000..dfdcecdfe72 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java @@ -0,0 +1,625 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; + +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.packed.PackedInts; + +/** + * {@link DocIdSet} implementation based on word-aligned hybrid encoding on + * words of 8 bits. + *

This implementation doesn't support random-access but has a fast + * {@link DocIdSetIterator} which can advance in logarithmic time thanks to + * an index.

+ *

The compression scheme is simplistic and should work well with sparse doc + * id sets while being only slightly larger than a {@link FixedBitSet} for + * incompressible sets (overhead<2% in the worst case) in spite of the index.

+ *

Format: The format is byte-aligned. An 8-bits word is either clean, + * meaning composed only of zeros, or dirty, meaning that it contains at least one + * bit set. The idea is to encode sequences of clean words using run-length + * encoding and to leave sequences of dirty words as-is.

+ * + * + * + *
TokenClean length+Dirty length+Dirty words
1 byte0-n bytes0-n bytes0-n bytes
+ * + *

This format cannot encode sequences of less than 2 clean words and 1 dirty + * word. The reason is that if you find a single clean word, you should rather + * encode it as a dirty word. This takes the same space as starting a new + * sequence (since you need one byte for the token) but will be lighter to + * decode. There is however an exception for the first sequence. Since the first + * sequence may start directly with a dirty word, the clean length is encoded + * directly, without subtracting 2.

+ *

There is an additional restriction on the format: the sequence of dirty + * words must start and end with a non-null word and is not allowed to contain + * two consecutive null words. This restriction exists to make sure no space is + * wasted and to make sure iterators can read the next doc ID by reading at most + * 2 dirty words.

+ * @lucene.experimental + */ +public final class WAH8DocIdSet extends DocIdSet { + + // Minimum index interval, intervals below this value can't guarantee anymore + // that this set implementation won't be significantly larger than a FixedBitSet + // The reason is that a single sequence saves at least one byte and an index + // entry requires at most 8 bytes (2 ints) so there shouldn't be more than one + // index entry every 8 sequences + private static final int MIN_INDEX_INTERVAL = 8; + + /** Default index interval. */ + // To compute this default value, I created a rather dense set (0.1% load + // factor, which is close to the worst case regarding both compression and + // speed for this DocIdSet impl since sequences are going to be short) and I + // started with interval=1 and doubled it at each iteration until advance + // became slower + public static final int DEFAULT_INDEX_INTERVAL = 16; + + private static final PackedInts.Reader EMPTY_READER = new PackedInts.NullReader(1); + private static WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], EMPTY_READER, EMPTY_READER); + + private static final Comparator SERIALIZED_LENGTH_COMPARATOR = new Comparator() { + @Override + public int compare(Iterator wi1, Iterator wi2) { + return wi1.in.length() - wi2.in.length(); + } + }; + + /** Same as {@link #copyOf(DocIdSetIterator, int)} with the default index interval. */ + public static WAH8DocIdSet copyOf(DocIdSetIterator it) throws IOException { + return copyOf(it, DEFAULT_INDEX_INTERVAL); + } + + /** Return a copy of the provided iterator. */ + public static WAH8DocIdSet copyOf(DocIdSetIterator it, int indexInterval) throws IOException { + Builder builder = new Builder().setIndexInterval(indexInterval); + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + builder.add(doc); + } + return builder.build(); + } + + /** Same as {@link #intersect(Collection, int)} with the default index interval. */ + public static WAH8DocIdSet intersect(Collection docIdSets) { + return intersect(docIdSets, DEFAULT_INDEX_INTERVAL); + } + + /** + * Compute the intersection of the provided sets. This method is much faster than + * computing the intersection manually since it operates directly at the byte level. + */ + public static WAH8DocIdSet intersect(Collection docIdSets, int indexInterval) { + switch (docIdSets.size()) { + case 0: + throw new IllegalArgumentException("There must be at least one set to intersect"); + case 1: + return docIdSets.iterator().next(); + } + // The logic below is similar to ConjunctionScorer + final int numSets = docIdSets.size(); + final Iterator[] iterators = new Iterator[numSets]; + int i = 0; + for (WAH8DocIdSet set : docIdSets) { + final Iterator it = set.iterator(); + iterators[i++] = it; + } + Arrays.sort(iterators, SERIALIZED_LENGTH_COMPARATOR); + final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval); + int wordNum = 0; + main: + while (true) { + // Advance the least costly iterator first + iterators[0].advanceWord(wordNum); + wordNum = iterators[0].wordNum; + if (wordNum == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + byte word = iterators[0].word; + for (i = 1; i < numSets; ++i) { + if (iterators[i].wordNum < wordNum) { + iterators[i].advanceWord(wordNum); + } + if (iterators[i].wordNum > wordNum) { + wordNum = iterators[i].wordNum; + continue main; + } + assert iterators[i].wordNum == wordNum; + word &= iterators[i].word; + if (word == 0) { + // There are common words, but they don't share any bit + ++wordNum; + continue main; + } + } + // Found a common word + assert word != 0; + builder.addWord(wordNum, word); + ++wordNum; + } + return builder.build(); + } + + /** Same as {@link #union(Collection, int)} with the default index interval. */ + public static WAH8DocIdSet union(Collection docIdSets) { + return union(docIdSets, DEFAULT_INDEX_INTERVAL); + } + + /** + * Compute the union of the provided sets. This method is much faster than + * computing the union manually since it operates directly at the byte level. + */ + public static WAH8DocIdSet union(Collection docIdSets, int indexInterval) { + switch (docIdSets.size()) { + case 0: + return EMPTY; + case 1: + return docIdSets.iterator().next(); + } + // The logic below is very similar to DisjunctionScorer + final int numSets = docIdSets.size(); + final PriorityQueue iterators = new PriorityQueue(numSets) { + @Override + protected boolean lessThan(Iterator a, Iterator b) { + return a.wordNum < b.wordNum; + } + }; + for (WAH8DocIdSet set : docIdSets) { + Iterator iterator = set.iterator(); + iterator.nextWord(); + iterators.add(iterator); + } + + Iterator top = iterators.top(); + if (top.wordNum == Integer.MAX_VALUE) { + return EMPTY; + } + int wordNum = top.wordNum; + byte word = top.word; + final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval); + while (true) { + top.nextWord(); + iterators.updateTop(); + top = iterators.top(); + if (top.wordNum == wordNum) { + word |= top.word; + } else { + builder.addWord(wordNum, word); + if (top.wordNum == Integer.MAX_VALUE) { + break; + } + wordNum = top.wordNum; + word = top.word; + } + } + return builder.build(); + } + + static int wordNum(int docID) { + assert docID >= 0; + return docID >>> 3; + } + + /** Word-based builder. */ + static class WordBuilder { + + final GrowableByteArrayDataOutput out; + final GrowableByteArrayDataOutput dirtyWords; + int clean; + int lastWordNum; + int numSequences; + int indexInterval; + + WordBuilder() { + out = new GrowableByteArrayDataOutput(1024); + dirtyWords = new GrowableByteArrayDataOutput(128); + clean = 0; + lastWordNum = -1; + numSequences = 0; + indexInterval = DEFAULT_INDEX_INTERVAL; + } + + /** Set the index interval. Smaller index intervals improve performance of + * {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet} + * larger. An index interval i makes the index add an overhead + * which is at most 4/i, but likely much less.The default index + * interval is 16, meaning the index has an overhead of at most + * 25%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as an + * index interval. */ + public WordBuilder setIndexInterval(int indexInterval) { + if (indexInterval < MIN_INDEX_INTERVAL) { + throw new IllegalArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL); + } + this.indexInterval = indexInterval; + return this; + } + + void writeHeader(int cleanLength) throws IOException { + final int cleanLengthMinus2 = cleanLength - 2; + final int dirtyLengthMinus1 = dirtyWords.length - 1; + assert cleanLengthMinus2 >= 0; + assert dirtyLengthMinus1 >= 0; + int token = ((cleanLengthMinus2 & 0x07) << 4) | (dirtyLengthMinus1 & 0x07); + if (cleanLengthMinus2 > 0x07) { + token |= 1 << 7; + } + if (dirtyLengthMinus1 > 0x07) { + token |= 1 << 3; + } + out.writeByte((byte) token); + if (cleanLengthMinus2 > 0x07) { + out.writeVInt(cleanLengthMinus2 >>> 3); + } + if (dirtyLengthMinus1 > 0x07) { + out.writeVInt(dirtyLengthMinus1 >>> 3); + } + } + + void writeSequence(int cleanLength) { + try { + writeHeader(cleanLength); + out.writeBytes(dirtyWords.bytes, dirtyWords.length); + } catch (IOException cannotHappen) { + throw new AssertionError(cannotHappen); + } + dirtyWords.length = 0; + ++numSequences; + } + + void addWord(int wordNum, byte word) { + assert wordNum > lastWordNum; + assert word != 0; + + if (lastWordNum == -1) { + clean = 2 + wordNum; // special case for the 1st sequence + dirtyWords.writeByte(word); + } else { + switch (wordNum - lastWordNum) { + case 1: + dirtyWords.writeByte(word); + break; + case 2: + dirtyWords.writeByte((byte) 0); + dirtyWords.writeByte(word); + break; + default: + writeSequence(clean); + clean = wordNum - lastWordNum - 1; + dirtyWords.writeByte(word); + } + } + lastWordNum = wordNum; + } + + /** Build a new {@link WAH8DocIdSet}. */ + public WAH8DocIdSet build() { + if (lastWordNum == -1) { + return EMPTY; + } + writeSequence(clean); + final byte[] data = Arrays.copyOf(out.bytes, out.length); + + // Now build the index + final int valueCount = (numSequences - 1) / indexInterval + 1; + final PackedInts.Reader indexPositions; + final PackedInts.Reader indexWordNums; + if (valueCount <= 1) { + indexPositions = indexWordNums = EMPTY_READER; + } else { + // From the tests I ran, there is no need to expose acceptableOverheadRatio, these packed ints are never the bottleneck + final PackedInts.Mutable positions = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(data.length - 1), PackedInts.COMPACT); + final PackedInts.Mutable wordNums = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(lastWordNum), PackedInts.COMPACT); + + final Iterator it = new Iterator(data, null, null); + assert it.in.getPosition() == 0; + assert it.wordNum == -1; + for (int i = 1; i < valueCount; ++i) { + // skip indexInterval sequences + for (int j = 0; j < indexInterval; ++j) { + final boolean readSequence = it.readSequence(); + assert readSequence; + it.skipDirtyBytes(); + } + final int position = it.in.getPosition(); + final int wordNum = it.wordNum; + positions.set(i, position); + wordNums.set(i, wordNum + 1); + } + indexPositions = positions; + indexWordNums = wordNums; + } + + return new WAH8DocIdSet(data, indexPositions, indexWordNums); + } + + } + + /** A builder for {@link WAH8DocIdSet}s. */ + public static final class Builder extends WordBuilder { + + private int lastDocID; + private int wordNum, word; + + /** Sole constructor */ + public Builder() { + super(); + lastDocID = -1; + wordNum = -1; + word = 0; + } + + /** Add a document to this builder. Documents must be added in order. */ + public Builder add(int docID) { + if (docID <= lastDocID) { + throw new IllegalArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID); + } + final int wordNum = wordNum(docID); + if (this.wordNum == -1) { + this.wordNum = wordNum; + word = 1 << (docID & 0x07); + } else if (wordNum == this.wordNum) { + word |= 1 << (docID & 0x07); + } else { + addWord(this.wordNum, (byte) word); + this.wordNum = wordNum; + word = 1 << (docID & 0x07); + } + lastDocID = docID; + return this; + } + + @Override + public Builder setIndexInterval(int indexInterval) { + return (Builder) super.setIndexInterval(indexInterval); + } + + @Override + public WAH8DocIdSet build() { + if (this.wordNum != -1) { + addWord(wordNum, (byte) word); + } + return super.build(); + } + + } + + // where the doc IDs are stored + private final byte[] data; + // index for advance(int) + private final PackedInts.Reader positions, wordNums; // wordNums[i] starts at the sequence at positions[i] + + WAH8DocIdSet(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) { + this.data = data; + this.positions = positions; + this.wordNums = wordNums; + } + + @Override + public boolean isCacheable() { + return true; + } + + @Override + public Iterator iterator() { + return new Iterator(data, positions, wordNums); + } + + static int readLength(ByteArrayDataInput in, int len) { + if ((len & 0x08) == 0) { + // no continuation bit + return len; + } + return (len & 0x07) | (in.readVInt() << 3); + } + + static class Iterator extends DocIdSetIterator { + + final ByteArrayDataInput in; + final PackedInts.Reader positions, wordNums; + int dirtyLength; + + int wordNum; // byte offset + byte word; // current word + int bitList; // list of bits set in the current word + + int docID; + + Iterator(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) { + this.in = new ByteArrayDataInput(data); + this.positions = positions; + this.wordNums = wordNums; + wordNum = -1; + word = 0; + bitList = 0; + docID = -1; + } + + boolean readSequence() { + if (in.eof()) { + wordNum = Integer.MAX_VALUE; + return false; + } + final int token = in.readByte() & 0xFF; + final int cleanLength = (in.getPosition() == 1 ? 0 : 2) + readLength(in, token >>> 4); + wordNum += cleanLength; + dirtyLength = 1 + readLength(in, token & 0x0F); + return true; + } + + void skipDirtyBytes(int count) { + assert count >= 0; + assert count <= dirtyLength; + in.skipBytes(count); + wordNum += count; + dirtyLength -= count; + } + + void skipDirtyBytes() { + in.skipBytes(dirtyLength); + wordNum += dirtyLength; + dirtyLength = 0; + } + + void nextWord() { + if (dirtyLength == 0 && !readSequence()) { + return; + } + word = in.readByte(); + if (word == 0) { + word = in.readByte(); + assert word != 0; // there can never be two consecutive null dirty words + ++wordNum; + --dirtyLength; + } + ++wordNum; + --dirtyLength; + } + + int binarySearch(int targetWordNum) { + int lo = 0, hi = positions.size() - 1; + while (lo <= hi) { + final int mid = (lo + hi) >>> 1; + final int midWordNum = (int) wordNums.get(mid); + if (midWordNum <= targetWordNum) { + lo = mid + 1; + } else { + hi = mid - 1; + } + } + assert wordNums.get(hi) <= targetWordNum; + assert hi+1 == wordNums.size() || wordNums.get(hi + 1) > targetWordNum; + return hi; + } + + void advanceWord(int targetWordNum) { + assert targetWordNum > wordNum; + if (dirtyLength == 0 && !readSequence()) { + return; + } + int delta = targetWordNum - wordNum; + if (delta <= dirtyLength + 1) { + if (delta > 1) { + skipDirtyBytes(delta - 1); + } + } else { + skipDirtyBytes(); + assert dirtyLength == 0; + // use the index + final int i = binarySearch(targetWordNum); + final int position = (int) positions.get(i); + if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move + wordNum = (int) wordNums.get(i) - 1; + in.setPosition(position); + } + + while (true) { + if (!readSequence()) { + return; + } + delta = targetWordNum - wordNum; + if (delta <= dirtyLength + 1) { + if (delta > 1) { + skipDirtyBytes(delta - 1); + } + break; + } + skipDirtyBytes(); + } + } + + nextWord(); + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + if (bitList != 0) { // there are remaining bits in the current word + docID = (wordNum << 3) | ((bitList & 0x0F) - 1); + bitList >>>= 4; + return docID; + } + nextWord(); + if (wordNum == Integer.MAX_VALUE) { + return docID = NO_MORE_DOCS; + } + bitList = BitUtil.bitList(word); + assert bitList != 0; + docID = (wordNum << 3) | ((bitList & 0x0F) - 1); + bitList >>>= 4; + return docID; + } + + @Override + public int advance(int target) throws IOException { + assert target > docID; + final int targetWordNum = wordNum(target); + if (targetWordNum > wordNum) { + advanceWord(targetWordNum); + bitList = BitUtil.bitList(word); + } + return slowAdvance(target); + } + + @Override + public long cost() { + return in.length(); // good estimation of the cost of iterating over all docs + } + + } + + /** Return the number of documents in this {@link DocIdSet}. This method + * runs in linear time but is much faster than counting documents. */ + public int cardinality() { + int cardinality = 0; + for (Iterator it = iterator(); it.wordNum != Integer.MAX_VALUE; it.nextWord()) { + cardinality += BitUtil.bitCount(it.word); + } + return cardinality; + } + + /** Return the memory usage of this class in bytes. */ + public long ramBytesUsed() { + return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF) + + RamUsageEstimator.sizeOf(data) + + positions.ramBytesUsed() + + wordNums.ramBytesUsed(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java b/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java new file mode 100644 index 00000000000..3ca7ea94ac2 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java @@ -0,0 +1,144 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + +public class TestWAH8DocIdSet extends LuceneTestCase { + + private static FixedBitSet randomSet(int numBits, int numBitsSet) { + assert numBitsSet <= numBits; + final FixedBitSet set = new FixedBitSet(numBits); + if (numBitsSet == numBits) { + set.set(0, set.length()); + } else { + for (int i = 0; i < numBitsSet; ++i) { + while (true) { + final int o = random().nextInt(numBits); + if (!set.get(o)) { + set.set(o); + break; + } + } + } + } + return set; + } + + private static FixedBitSet randomSet(int numBits, float percentSet) { + return randomSet(numBits, (int) (percentSet * numBits)); + } + + public void testAgainstFixedBitSet() throws IOException { + final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20); + for (float percentSet : new float[] {0f, 0.0001f, random().nextFloat() / 2, 0.9f, 1f}) { + final FixedBitSet set = randomSet(numBits, percentSet); + final WAH8DocIdSet copy = WAH8DocIdSet.copyOf(set.iterator()); + assertEquals(numBits, set, copy); + } + } + + public void assertEquals(int numBits, FixedBitSet ds1, WAH8DocIdSet ds2) throws IOException { + assertEquals(ds1.cardinality(), ds2.cardinality()); + + // nextDoc + DocIdSetIterator it1 = ds1.iterator(); + DocIdSetIterator it2 = ds2.iterator(); + assertEquals(it1.docID(), it2.docID()); + for (int doc = it1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it1.nextDoc()) { + assertEquals(doc, it2.nextDoc()); + assertEquals(it1.docID(), it2.docID()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it2.nextDoc()); + assertEquals(it1.docID(), it2.docID()); + + // nextDoc / advance + it1 = ds1.iterator(); + it2 = ds2.iterator(); + for (int doc = -1; doc != DocIdSetIterator.NO_MORE_DOCS;) { + if (random().nextBoolean()) { + doc = it1.nextDoc(); + assertEquals(doc, it2.nextDoc()); + assertEquals(it1.docID(), it2.docID()); + } else { + final int target = doc + 1 + random().nextInt(random().nextBoolean() ? 64 : numBits / 64); + doc = it1.advance(target); + assertEquals(doc, it2.advance(target)); + assertEquals(it1.docID(), it2.docID()); + } + } + } + + public void testUnion() throws IOException { + final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20); + final int numDocIdSets = _TestUtil.nextInt(random(), 0, 4); + final List fixedSets = new ArrayList(numDocIdSets); + for (int i = 0; i < numDocIdSets; ++i) { + fixedSets.add(randomSet(numBits, random().nextFloat() / 16)); + } + final List compressedSets = new ArrayList(numDocIdSets); + for (FixedBitSet set : fixedSets) { + compressedSets.add(WAH8DocIdSet.copyOf(set.iterator())); + } + + final WAH8DocIdSet union = WAH8DocIdSet.union(compressedSets); + final FixedBitSet expected = new FixedBitSet(numBits); + for (DocIdSet set : fixedSets) { + final DocIdSetIterator it = set.iterator(); + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + expected.set(doc); + } + } + assertEquals(numBits, expected, union); + } + + public void testIntersection() throws IOException { + final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20); + final int numDocIdSets = _TestUtil.nextInt(random(), 1, 4); + final List fixedSets = new ArrayList(numDocIdSets); + for (int i = 0; i < numDocIdSets; ++i) { + fixedSets.add(randomSet(numBits, random().nextFloat())); + } + final List compressedSets = new ArrayList(numDocIdSets); + for (FixedBitSet set : fixedSets) { + compressedSets.add(WAH8DocIdSet.copyOf(set.iterator())); + } + + final WAH8DocIdSet union = WAH8DocIdSet.intersect(compressedSets); + final FixedBitSet expected = new FixedBitSet(numBits); + expected.set(0, expected.length()); + for (DocIdSet set : fixedSets) { + final DocIdSetIterator it = set.iterator(); + int lastDoc = -1; + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + expected.clear(lastDoc + 1, doc); + lastDoc = doc; + } + if (lastDoc + 1 < expected.length()) { + expected.clear(lastDoc + 1, expected.length()); + } + } + assertEquals(numBits, expected, union); + } + +}