LUCENE-5081: WAH8DocIdSet.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1501925 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-07-10 19:24:24 +00:00
parent 7d1e3ba363
commit f45358ebc1
10 changed files with 878 additions and 84 deletions

View File

@ -51,6 +51,9 @@ New features
* LUCENE-5084: Added new Elias-Fano encoder, decoder and DocIdSet * LUCENE-5084: Added new Elias-Fano encoder, decoder and DocIdSet
implementations. (Paul Elschot via Adrien Grand) implementations. (Paul Elschot via Adrien Grand)
* LUCENE-5081: Added WAH8DocIdSet, an in-memory doc id set implementation based
on word-aligned hybrid encoding. (Adrien Grand)
======================= Lucene 4.4.0 ======================= ======================= Lucene 4.4.0 =======================
Changes in backwards compatibility policy Changes in backwards compatibility policy

View File

@ -21,7 +21,6 @@ import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;

View File

@ -44,6 +44,7 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;

View File

@ -45,6 +45,7 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.BlockPackedWriter;

View File

@ -26,6 +26,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MutableBits; import org.apache.lucene.util.MutableBits;
@ -166,7 +167,7 @@ final class BitVector implements Cloneable, MutableBits {
int c = 0; int c = 0;
int end = bits.length; int end = bits.length;
for (int i = 0; i < end; i++) { for (int i = 0; i < end; i++) {
c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte c += BitUtil.bitCount(bits[i]); // sum bits per byte
} }
count = c; count = c;
} }
@ -179,29 +180,12 @@ final class BitVector implements Cloneable, MutableBits {
int c = 0; int c = 0;
int end = bits.length; int end = bits.length;
for (int i = 0; i < end; i++) { for (int i = 0; i < end; i++) {
c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte c += BitUtil.bitCount(bits[i]); // sum bits per byte
} }
return c; return c;
} }
private static final byte[] BYTE_COUNTS = { // table of bits/byte
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
private static String CODEC = "BitVector"; private static String CODEC = "BitVector";
@ -294,7 +278,7 @@ final class BitVector implements Cloneable, MutableBits {
output.writeVInt(i-last); output.writeVInt(i-last);
output.writeByte(bits[i]); output.writeByte(bits[i]);
last = i; last = i;
numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]); numCleared -= (8-BitUtil.bitCount(bits[i]));
assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7))); assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7)));
} }
} }
@ -399,7 +383,7 @@ final class BitVector implements Cloneable, MutableBits {
while (n>0) { while (n>0) {
last += input.readVInt(); last += input.readVInt();
bits[last] = input.readByte(); bits[last] = input.readByte();
n -= BYTE_COUNTS[bits[last] & 0xFF]; n -= BitUtil.bitCount(bits[last]);
assert n >= 0; assert n >= 0;
} }
} }
@ -416,7 +400,7 @@ final class BitVector implements Cloneable, MutableBits {
while (numCleared>0) { while (numCleared>0) {
last += input.readVInt(); last += input.readVInt();
bits[last] = input.readByte(); bits[last] = input.readByte();
numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF]; numCleared -= 8-BitUtil.bitCount(bits[last]);
assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7))); assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7)));
} }
} }

View File

@ -22,8 +22,93 @@ package org.apache.lucene.util; // from org.apache.solr.util rev 555343
*/ */
public final class BitUtil { public final class BitUtil {
// Lookup table with 256 entries, indexed by the unsigned value of a byte:
// entry i holds the number of bits that are set in i. It is read by
// bitCount(byte), which masks its argument with 0xFF before indexing.
private static final byte[] BYTE_COUNTS = { // table of bits/byte
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
// The General Idea: instead of having an array per byte that has
// the offsets of the next set bit, that array could be
// packed inside a 32 bit integer (8 4 bit numbers). That
// should be faster than accessing an array for each index, and
// the total array size is kept smaller (256*sizeof(int))=1K
/***** the python code that generated bitlist
def bits2int(val):
arr=0
for shift in range(8,0,-1):
if val & 0x80:
arr = (arr << 4) | shift
val = val << 1
return arr
def int_table():
tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
return ','.join(tbl)
******/
// Lookup table with 256 entries, indexed by the unsigned value of a byte.
// Each entry packs, 4 bits per set bit and lowest bit first, the offset of
// every set bit plus one (e.g. entry 3 is 0x21: bit 0 -> 1, bit 1 -> 2).
// Unused high nibbles are 0. It is read by bitList(byte).
private static final int[] BIT_LISTS = {
0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
};
private BitUtil() {} // no instance private BitUtil() {} // no instance
/**
 * Returns the number of bits that are set in <code>b</code>.
 *
 * @param b the byte to inspect; it is treated as an unsigned 8-bit value
 * @return the number of set bits, between 0 and 8
 */
public static int bitCount(byte b) {
  // Mask the sign extension away so that negative bytes map to 128..255.
  final int unsignedValue = b & 0xFF;
  return BYTE_COUNTS[unsignedValue];
}
/**
 * Returns the list of bits which are set in <code>b</code>, encoded as follows:
 * <code>(i &gt;&gt;&gt; (4 * n)) &amp; 0x0F</code> is the offset of the n-th set bit of
 * the given byte plus one, or 0 if there are n or fewer bits set in the given
 * byte. For example <code>bitList(12)</code> returns 0x43:<ul>
 * <li><code>0x43 &amp; 0x0F</code> is 3, meaning the first bit set is at offset 3-1 = 2,</li>
 * <li><code>(0x43 &gt;&gt;&gt; 4) &amp; 0x0F</code> is 4, meaning there is a second bit set at offset 4-1 = 3,</li>
 * <li><code>(0x43 &gt;&gt;&gt; 8) &amp; 0x0F</code> is 0, meaning there are no more bits set in this byte.</li>
 * </ul>
 *
 * @param b the byte to inspect; it is treated as an unsigned 8-bit value
 * @return the packed list of set-bit offsets (each offset plus one, 4 bits per entry)
 */
public static int bitList(byte b) {
  // Mask the sign extension away so that negative bytes map to 128..255.
  final int unsignedValue = b & 0xFF;
  return BIT_LISTS[unsignedValue];
}
// The pop methods used to rely on bit-manipulation tricks for speed but it // The pop methods used to rely on bit-manipulation tricks for speed but it
// turns out that it is faster to use the Long.bitCount method (which is an // turns out that it is faster to use the Long.bitCount method (which is an
// intrinsic since Java 6u18) in a naive loop, see LUCENE-2221 // intrinsic since Java 6u18) in a naive loop, see LUCENE-2221

View File

@ -1,4 +1,4 @@
package org.apache.lucene.codecs.compressing; package org.apache.lucene.util;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,26 +17,27 @@ package org.apache.lucene.codecs.compressing;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
/** /**
* A {@link DataOutput} that can be used to build a byte[]. * A {@link DataOutput} that can be used to build a byte[].
* @lucene.internal
*/ */
final class GrowableByteArrayDataOutput extends DataOutput { public final class GrowableByteArrayDataOutput extends DataOutput {
byte[] bytes; /** The bytes */
int length; public byte[] bytes;
/** The length */
public int length;
GrowableByteArrayDataOutput(int cp) { /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
public GrowableByteArrayDataOutput(int cp) {
this.bytes = new byte[ArrayUtil.oversize(cp, 1)]; this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
this.length = 0; this.length = 0;
} }
@Override @Override
public void writeByte(byte b) throws IOException { public void writeByte(byte b) {
if (length >= bytes.length) { if (length >= bytes.length) {
bytes = ArrayUtil.grow(bytes); bytes = ArrayUtil.grow(bytes);
} }
@ -44,7 +45,7 @@ final class GrowableByteArrayDataOutput extends DataOutput {
} }
@Override @Override
public void writeBytes(byte[] b, int off, int len) throws IOException { public void writeBytes(byte[] b, int off, int len) {
final int newLength = length + len; final int newLength = length + len;
if (newLength > bytes.length) { if (newLength > bytes.length) {
bytes = ArrayUtil.grow(bytes, newLength); bytes = ArrayUtil.grow(bytes, newLength);

View File

@ -25,55 +25,6 @@ import org.apache.lucene.search.DocIdSetIterator;
*/ */
public class OpenBitSetIterator extends DocIdSetIterator { public class OpenBitSetIterator extends DocIdSetIterator {
// The General Idea: instead of having an array per byte that has
// the offsets of the next set bit, that array could be
// packed inside a 32 bit integer (8 4 bit numbers). That
// should be faster than accessing an array for each index, and
// the total array size is kept smaller (256*sizeof(int))=1K
protected final static int[] bitlist={
0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
};
/***** the python code that generated bitlist
def bits2int(val):
arr=0
for shift in range(8,0,-1):
if val & 0x80:
arr = (arr << 4) | shift
val = val << 1
return arr
def int_table():
tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
return ','.join(tbl)
******/
// hmmm, what about an iterator that finds zeros though, // hmmm, what about an iterator that finds zeros though,
// or a reverse iterator... should they be separate classes // or a reverse iterator... should they be separate classes
// for efficiency, or have a common root interface? (or // for efficiency, or have a common root interface? (or
@ -101,7 +52,7 @@ public class OpenBitSetIterator extends DocIdSetIterator {
if ((int)word ==0) {wordShift +=32; word = word >>>32; } if ((int)word ==0) {wordShift +=32; word = word >>>32; }
if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; } if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; } if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
indexArray = bitlist[(int)word & 0xff]; indexArray = BitUtil.bitList((byte) word);
} }
/***** alternate shift implementations /***** alternate shift implementations

View File

@ -0,0 +1,625 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.packed.PackedInts;
/**
* {@link DocIdSet} implementation based on word-aligned hybrid encoding on
* words of 8 bits.
* <p>This implementation doesn't support random-access but has a fast
* {@link DocIdSetIterator} which can advance in logarithmic time thanks to
* an index.</p>
* <p>The compression scheme is simplistic and should work well with sparse doc
* id sets while being only slightly larger than a {@link FixedBitSet} for
* incompressible sets (overhead&lt;2% in the worst case) in spite of the index.</p>
* <p><b>Format</b>: The format is byte-aligned. An 8-bits word is either clean,
* meaning composed only of zeros, or dirty, meaning that it contains at least one
* bit set. The idea is to encode sequences of clean words using run-length
* encoding and to leave sequences of dirty words as-is.</p>
* <table>
* <tr><th>Token</th><th>Clean length+</th><th>Dirty length+</th><th>Dirty words</th></tr>
* <tr><td>1 byte</td><td>0-n bytes</td><td>0-n bytes</td><td>0-n bytes</td></tr>
* </table>
* <ul>
* <li><b>Token</b> encodes the number of clean words minus 2 on the first 4
* bits and the number of dirty words minus 1 on the last 4 bits. In each of
* these two 4-bit groups, the higher-order bit is a continuation bit, meaning
* that the number is incomplete and needs additional bytes to be read.</li>
* <li><b>Clean length+</b>: If clean length has its higher-order bit set,
* you need to read a {@link DataInput#readVInt() vint}, shift it by 3 bits on
* the left side and add it to the 3 bits which have been read in the token.</li>
* <li><b>Dirty length+</b> works the same way as <b>Clean length+</b> but
* for the length of dirty words.</li>
* <li><b>Dirty words</b> are the dirty words, there are <b>Dirty length</b>
* of them.</li>
* </ul>
* <p>This format cannot encode sequences of fewer than 2 clean words and 1 dirty
* word. The reason is that if you find a single clean word, you should rather
* encode it as a dirty word. This takes the same space as starting a new
* sequence (since you need one byte for the token) but will be lighter to
* decode. There is however an exception for the first sequence. Since the first
* sequence may start directly with a dirty word, the clean length is encoded
* directly, without subtracting 2.</p>
* <p>There is an additional restriction on the format: the sequence of dirty
* words must start and end with a non-null word and is not allowed to contain
* two consecutive null words. This restriction exists to make sure no space is
* wasted and to make sure iterators can read the next doc ID by reading at most
* 2 dirty words.</p>
* @lucene.experimental
*/
public final class WAH8DocIdSet extends DocIdSet {
// Minimum index interval, intervals below this value can't guarantee anymore
// that this set implementation won't be significantly larger than a FixedBitSet
// The reason is that a single sequence saves at least one byte and an index
// entry requires at most 8 bytes (2 ints) so there shouldn't be more than one
// index entry every 8 sequences
private static final int MIN_INDEX_INTERVAL = 8;

/** Default index interval. */
// To compute this default value, I created a rather dense set (0.1% load
// factor, which is close to the worst case regarding both compression and
// speed for this DocIdSet impl since sequences are going to be short) and I
// started with interval=1 and doubled it at each iteration until advance
// became slower
public static final int DEFAULT_INDEX_INTERVAL = 16;

// Placeholder index reader, used for both the positions and the word numbers
// of sets for which no index entries are built.
private static final PackedInts.Reader EMPTY_READER = new PackedInts.NullReader(1);

// Shared instance returned for empty sets. It is handed out to every caller,
// so the reference is final to guarantee it can never be re-pointed at
// another (possibly non-empty) instance.
private static final WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], EMPTY_READER, EMPTY_READER);
// Orders iterators by increasing length of their serialized data, so that the
// iterator over the smallest encoded set sorts first.
private static final Comparator<Iterator> SERIALIZED_LENGTH_COMPARATOR = new Comparator<Iterator>() {
  @Override
  public int compare(Iterator left, Iterator right) {
    final int leftLength = left.in.length();
    final int rightLength = right.in.length();
    // both lengths are non-negative so the difference cannot overflow
    return leftLength - rightLength;
  }
};
/**
 * Return a copy of the provided iterator, built with
 * {@link #DEFAULT_INDEX_INTERVAL} as the index interval.
 * @see #copyOf(DocIdSetIterator, int)
 */
public static WAH8DocIdSet copyOf(DocIdSetIterator it) throws IOException {
  final int defaultInterval = DEFAULT_INDEX_INTERVAL;
  return copyOf(it, defaultInterval);
}
/**
 * Return a copy of the provided iterator: the iterator is consumed until it
 * returns {@link DocIdSetIterator#NO_MORE_DOCS} and every doc ID it returns
 * is added to a new {@link Builder} configured with the given index interval.
 */
public static WAH8DocIdSet copyOf(DocIdSetIterator it, int indexInterval) throws IOException {
  final Builder copier = new Builder().setIndexInterval(indexInterval);
  int current = it.nextDoc();
  while (current != DocIdSetIterator.NO_MORE_DOCS) {
    copier.add(current);
    current = it.nextDoc();
  }
  return copier.build();
}
/**
 * Compute the intersection of the provided sets, building the result with
 * {@link #DEFAULT_INDEX_INTERVAL} as the index interval.
 * @see #intersect(Collection, int)
 */
public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets) {
  final int defaultInterval = DEFAULT_INDEX_INTERVAL;
  return intersect(docIdSets, defaultInterval);
}
/**
 * Compute the intersection of the provided sets. This method is much faster than
 * computing the intersection manually since it operates directly at the byte level.
 * <p>When a single set is provided, that same instance is returned.</p>
 * @throws IllegalArgumentException if <code>docIdSets</code> is empty
 */
public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
  final int numSets = docIdSets.size();
  if (numSets == 0) {
    throw new IllegalArgumentException("There must be at least one set to intersect");
  }
  if (numSets == 1) {
    return docIdSets.iterator().next();
  }

  // The logic below is similar to ConjunctionScorer
  final Iterator[] iterators = new Iterator[numSets];
  int slot = 0;
  for (WAH8DocIdSet set : docIdSets) {
    iterators[slot] = set.iterator();
    ++slot;
  }
  Arrays.sort(iterators, SERIALIZED_LENGTH_COMPARATOR);

  final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
  // after sorting, the first iterator is the one with the shortest serialized form
  final Iterator lead = iterators[0];
  int candidate = 0;
  search:
  for (;;) {
    // Advance the least costly iterator first
    lead.advanceWord(candidate);
    candidate = lead.wordNum;
    if (candidate == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
    byte shared = lead.word;
    for (int k = 1; k < numSets; ++k) {
      final Iterator other = iterators[k];
      if (other.wordNum < candidate) {
        other.advanceWord(candidate);
      }
      if (other.wordNum > candidate) {
        // this iterator has no word at the candidate position, retry from its position
        candidate = other.wordNum;
        continue search;
      }
      assert other.wordNum == candidate;
      shared &= other.word;
      if (shared == 0) {
        // There are common words, but they don't share any bit
        ++candidate;
        continue search;
      }
    }
    // Found a common word
    assert shared != 0;
    builder.addWord(candidate, shared);
    ++candidate;
  }
  return builder.build();
}
/**
 * Compute the union of the provided sets, building the result with
 * {@link #DEFAULT_INDEX_INTERVAL} as the index interval.
 * @see #union(Collection, int)
 */
public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets) {
  final int defaultInterval = DEFAULT_INDEX_INTERVAL;
  return union(docIdSets, defaultInterval);
}
/**
 * Compute the union of the provided sets. This method is much faster than
 * computing the union manually since it operates directly at the byte level.
 * <p>An empty collection yields the empty set, and when a single set is
 * provided, that same instance is returned.</p>
 */
public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
  final int numSets = docIdSets.size();
  if (numSets == 0) {
    return EMPTY;
  }
  if (numSets == 1) {
    return docIdSets.iterator().next();
  }

  // The logic below is very similar to DisjunctionScorer
  final PriorityQueue<Iterator> queue = new PriorityQueue<WAH8DocIdSet.Iterator>(numSets) {
    @Override
    protected boolean lessThan(Iterator a, Iterator b) {
      return a.wordNum < b.wordNum;
    }
  };
  for (WAH8DocIdSet set : docIdSets) {
    final Iterator positioned = set.iterator();
    positioned.nextWord(); // move every iterator onto its first word
    queue.add(positioned);
  }

  Iterator current = queue.top();
  if (current.wordNum == Integer.MAX_VALUE) {
    // the smallest word number is the exhausted marker: all sets are empty
    return EMPTY;
  }
  int pendingWordNum = current.wordNum;
  byte pendingWord = current.word;
  final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
  for (;;) {
    current.nextWord();
    queue.updateTop();
    current = queue.top();
    if (current.wordNum != pendingWordNum) {
      // no more iterators on the pending word number: flush it
      builder.addWord(pendingWordNum, pendingWord);
      if (current.wordNum == Integer.MAX_VALUE) {
        break;
      }
      pendingWordNum = current.wordNum;
      pendingWord = current.word;
    } else {
      pendingWord |= current.word;
    }
  }
  return builder.build();
}
/** Return the number of the 8-bits word which contains the given (non-negative) doc ID. */
static int wordNum(int docID) {
  assert docID >= 0;
  // 8 doc IDs per word
  final int wordIndex = docID >>> 3;
  return wordIndex;
}
/**
 * Word-based builder: non-null 8-bits words are added in increasing word
 * number order and are serialized as sequences of clean words followed by
 * dirty words.
 */
static class WordBuilder {

  // serialized sequences
  final GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(1024);
  // dirty words of the sequence which is currently being built
  final GrowableByteArrayDataOutput dirtyWords = new GrowableByteArrayDataOutput(128);
  // clean length (as expected by writeHeader) of the sequence which is currently being built
  int clean = 0;
  // number of the last word which has been added, or -1 if none
  int lastWordNum = -1;
  // number of sequences written to out so far
  int numSequences = 0;
  int indexInterval = DEFAULT_INDEX_INTERVAL;

  WordBuilder() {}

  /** Set the index interval. Smaller index intervals improve performance of
   *  {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet}
   *  larger. An index interval <code>i</code> makes the index add an overhead
   *  which is at most <code>4/i</code>, but likely much less. The default index
   *  interval is <code>16</code>, meaning the index has an overhead of at most
   *  25%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as an
   *  index interval.
   *  @throws IllegalArgumentException if the interval is below the minimum index interval */
  public WordBuilder setIndexInterval(int indexInterval) {
    if (indexInterval >= MIN_INDEX_INTERVAL) {
      this.indexInterval = indexInterval;
      return this;
    }
    throw new IllegalArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
  }

  /** Write the token, and the additional length bytes if any, of a sequence
   *  made of <code>cleanLength - 2</code> (encoded) clean words and of the
   *  pending dirty words. */
  void writeHeader(int cleanLength) throws IOException {
    final int cleanLengthMinus2 = cleanLength - 2;
    final int dirtyLengthMinus1 = dirtyWords.length - 1;
    assert cleanLengthMinus2 >= 0;
    assert dirtyLengthMinus1 >= 0;
    // a length which does not fit on 3 bits needs a continuation bit and a vint
    final boolean cleanNeedsMore = cleanLengthMinus2 > 0x07;
    final boolean dirtyNeedsMore = dirtyLengthMinus1 > 0x07;
    int token = ((cleanLengthMinus2 & 0x07) << 4) | (dirtyLengthMinus1 & 0x07);
    if (cleanNeedsMore) {
      token |= 0x80;
    }
    if (dirtyNeedsMore) {
      token |= 0x08;
    }
    out.writeByte((byte) token);
    if (cleanNeedsMore) {
      out.writeVInt(cleanLengthMinus2 >>> 3);
    }
    if (dirtyNeedsMore) {
      out.writeVInt(dirtyLengthMinus1 >>> 3);
    }
  }

  /** Flush the pending dirty words as a new sequence, then reset them. */
  void writeSequence(int cleanLength) {
    try {
      writeHeader(cleanLength);
      out.writeBytes(dirtyWords.bytes, dirtyWords.length);
    } catch (IOException cannotHappen) {
      throw new AssertionError(cannotHappen);
    }
    numSequences++;
    dirtyWords.length = 0;
  }

  /** Add a non-null word. Word numbers must be strictly increasing. */
  void addWord(int wordNum, byte word) {
    assert wordNum > lastWordNum;
    assert word != 0;
    if (lastWordNum == -1) {
      clean = 2 + wordNum; // special case for the 1st sequence
    } else {
      final int gap = wordNum - lastWordNum;
      if (gap == 2) {
        // a single null word between two dirty words is stored as a dirty word
        dirtyWords.writeByte((byte) 0);
      } else if (gap != 1) {
        // more than one null word: end the current sequence and start a new one
        writeSequence(clean);
        clean = gap - 1;
      }
    }
    dirtyWords.writeByte(word);
    lastWordNum = wordNum;
  }

  /** Build a new {@link WAH8DocIdSet}. */
  public WAH8DocIdSet build() {
    if (lastWordNum == -1) {
      return EMPTY;
    }
    writeSequence(clean);
    final byte[] data = Arrays.copyOf(out.bytes, out.length);

    // Now build the index
    final int valueCount = (numSequences - 1) / indexInterval + 1;
    if (valueCount <= 1) {
      return new WAH8DocIdSet(data, EMPTY_READER, EMPTY_READER);
    }

    // From the tests I ran, there is no need to expose acceptableOverheadRatio, these packed ints are never the bottleneck
    final PackedInts.Mutable positions = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(data.length - 1), PackedInts.COMPACT);
    final PackedInts.Mutable wordNums = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(lastWordNum), PackedInts.COMPACT);

    final Iterator it = new Iterator(data, null, null);
    assert it.in.getPosition() == 0;
    assert it.wordNum == -1;
    for (int entry = 1; entry < valueCount; ++entry) {
      // skip indexInterval sequences
      int remaining = indexInterval;
      while (remaining-- > 0) {
        final boolean readSequence = it.readSequence();
        assert readSequence;
        it.skipDirtyBytes();
      }
      positions.set(entry, it.in.getPosition());
      wordNums.set(entry, it.wordNum + 1);
    }
    return new WAH8DocIdSet(data, positions, wordNums);
  }

}
/**
 * A builder for {@link WAH8DocIdSet}s: doc IDs are added in increasing order
 * and accumulated into 8-bits words which are handed to the word-based builder.
 */
public static final class Builder extends WordBuilder {

  // last doc ID which has been added, or -1 if none
  private int lastDocID;
  // number of the word which is being accumulated, or -1 if none
  private int wordNum;
  // bits of the word which is being accumulated
  private int word;

  /** Sole constructor */
  public Builder() {
    super();
    word = 0;
    wordNum = -1;
    lastDocID = -1;
  }

  /**
   * Add a document to this builder. Documents must be added in order.
   * @throws IllegalArgumentException if <code>docID</code> is not greater
   *         than the previously added doc ID
   */
  public Builder add(int docID) {
    if (docID <= lastDocID) {
      throw new IllegalArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID);
    }
    final int docWordNum = wordNum(docID);
    final int docBit = 1 << (docID & 0x07);
    if (docWordNum == this.wordNum) {
      // same word as the previous document
      word |= docBit;
    } else {
      if (this.wordNum != -1) {
        // the pending word is complete, flush it
        addWord(this.wordNum, (byte) word);
      }
      this.wordNum = docWordNum;
      word = docBit;
    }
    lastDocID = docID;
    return this;
  }

  @Override
  public Builder setIndexInterval(int indexInterval) {
    return (Builder) super.setIndexInterval(indexInterval);
  }

  @Override
  public WAH8DocIdSet build() {
    final boolean hasPendingWord = this.wordNum != -1;
    if (hasPendingWord) {
      addWord(wordNum, (byte) word);
    }
    return super.build();
  }

}
// where the doc IDs are stored: the serialized sequences (token, optional
// length bytes, dirty words) produced by the builder
private final byte[] data;
// index for advance(int): both readers are handed as-is to every iterator
// created by iterator()
private final PackedInts.Reader positions, wordNums; // wordNums[i] starts at the sequence at positions[i]
/**
 * Create a set on top of already serialized data and of its index. The
 * arguments are stored as-is, no copy is made.
 */
WAH8DocIdSet(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
  this.wordNums = wordNums;
  this.positions = positions;
  this.data = data;
}
/** Always returns <code>true</code>. */
@Override
public boolean isCacheable() {
return true;
}
/** Return a new iterator over the serialized data of this set, which
 *  uses the index of this set (positions and word numbers). */
@Override
public Iterator iterator() {
return new Iterator(data, positions, wordNums);
}
/**
 * Decode a length from the 4 bits which have been read in a token. When
 * the continuation bit (0x08) is set, a vint is read from <code>in</code>,
 * shifted by 3 bits on the left side and combined with the 3 lower bits.
 */
static int readLength(ByteArrayDataInput in, int len) {
  final boolean hasContinuation = (len & 0x08) != 0;
  if (hasContinuation) {
    final int lowBits = len & 0x07;
    final int highBits = in.readVInt() << 3;
    return lowBits | highBits;
  }
  // no continuation bit
  return len;
}
static class Iterator extends DocIdSetIterator {
// input over the serialized sequences
final ByteArrayDataInput in;
// index used by binarySearch / advanceWord
final PackedInts.Reader positions, wordNums;
// number of dirty words of the current sequence which have not been consumed yet
int dirtyLength;
// number of the current 8-bits word: -1 initially, Integer.MAX_VALUE once
// readSequence() has reached the end of the data
int wordNum; // byte offset
byte word; // current word
// bits of the current word which have not been returned by nextDoc() yet,
// encoded as returned by BitUtil.bitList
int bitList; // list of bits set in the current word
// current doc ID, -1 initially
int docID;
/**
 * Create an iterator which is positioned before the first word of
 * <code>data</code> and which uses the provided index.
 */
Iterator(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
  // index
  this.positions = positions;
  this.wordNums = wordNums;
  // serialized sequences
  this.in = new ByteArrayDataInput(data);
  // iteration state: nothing has been read yet
  this.docID = -1;
  this.wordNum = -1;
  this.word = 0;
  this.bitList = 0;
}
/**
 * Read the header of the next sequence: skip its clean words and set
 * <code>dirtyLength</code> to its number of dirty words.
 * @return false if there are no more sequences, in which case
 *         <code>wordNum</code> is set to {@link Integer#MAX_VALUE}
 */
boolean readSequence() {
  if (in.eof()) {
    wordNum = Integer.MAX_VALUE;
    return false;
  }
  final int token = in.readByte() & 0xFF;
  // the clean length of the first sequence (token at position 0) is stored
  // directly, other sequences store it minus 2
  final int cleanBase;
  if (in.getPosition() == 1) {
    cleanBase = 0;
  } else {
    cleanBase = 2;
  }
  final int cleanLength = cleanBase + readLength(in, token >>> 4);
  wordNum += cleanLength;
  final int dirtyLengthMinus1 = readLength(in, token & 0x0F);
  dirtyLength = dirtyLengthMinus1 + 1;
  return true;
}
/** Skip <code>count</code> dirty words of the current sequence without reading them. */
void skipDirtyBytes(int count) {
  assert count >= 0;
  assert count <= dirtyLength;
  in.skipBytes(count);
  // one word per skipped byte
  wordNum = wordNum + count;
  dirtyLength = dirtyLength - count;
}
void skipDirtyBytes() {
in.skipBytes(dirtyLength);
wordNum += dirtyLength;
dirtyLength = 0;
}
void nextWord() {
if (dirtyLength == 0 && !readSequence()) {
return;
}
word = in.readByte();
if (word == 0) {
word = in.readByte();
assert word != 0; // there can never be two consecutive null dirty words
++wordNum;
--dirtyLength;
}
++wordNum;
--dirtyLength;
}
int binarySearch(int targetWordNum) {
int lo = 0, hi = positions.size() - 1;
while (lo <= hi) {
final int mid = (lo + hi) >>> 1;
final int midWordNum = (int) wordNums.get(mid);
if (midWordNum <= targetWordNum) {
lo = mid + 1;
} else {
hi = mid - 1;
}
}
assert wordNums.get(hi) <= targetWordNum;
assert hi+1 == wordNums.size() || wordNums.get(hi + 1) > targetWordNum;
return hi;
}
void advanceWord(int targetWordNum) {
assert targetWordNum > wordNum;
if (dirtyLength == 0 && !readSequence()) {
return;
}
int delta = targetWordNum - wordNum;
if (delta <= dirtyLength + 1) {
if (delta > 1) {
skipDirtyBytes(delta - 1);
}
} else {
skipDirtyBytes();
assert dirtyLength == 0;
// use the index
final int i = binarySearch(targetWordNum);
final int position = (int) positions.get(i);
if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move
wordNum = (int) wordNums.get(i) - 1;
in.setPosition(position);
}
while (true) {
if (!readSequence()) {
return;
}
delta = targetWordNum - wordNum;
if (delta <= dirtyLength + 1) {
if (delta > 1) {
skipDirtyBytes(delta - 1);
}
break;
}
skipDirtyBytes();
}
}
nextWord();
}
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() throws IOException {
if (bitList != 0) { // there are remaining bits in the current word
docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
bitList >>>= 4;
return docID;
}
nextWord();
if (wordNum == Integer.MAX_VALUE) {
return docID = NO_MORE_DOCS;
}
bitList = BitUtil.bitList(word);
assert bitList != 0;
docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
bitList >>>= 4;
return docID;
}
@Override
public int advance(int target) throws IOException {
assert target > docID;
final int targetWordNum = wordNum(target);
if (targetWordNum > wordNum) {
advanceWord(targetWordNum);
bitList = BitUtil.bitList(word);
}
return slowAdvance(target);
}
@Override
public long cost() {
return in.length(); // good estimation of the cost of iterating over all docs
}
}
/**
 * Return the number of documents in this {@link DocIdSet}. This method runs in
 * linear time but is much faster than counting documents one by one, since it
 * walks the encoded words and sums the number of bits set in each of them.
 */
public int cardinality() {
  int count = 0;
  final Iterator words = iterator();
  // a fresh iterator has a null current word, so the first pass adds nothing
  while (words.wordNum != Integer.MAX_VALUE) {
    count += BitUtil.bitCount(words.word);
    words.nextWord();
  }
  return count;
}
/**
 * Return an estimate of the memory usage of this instance in bytes: the
 * shallow size of the object plus the sizes reported for the encoded byte
 * array and for the two index readers.
 */
public long ramBytesUsed() {
  // shallow size = object header + the three references (data, positions, wordNums);
  // the header was previously left out, which under-estimated every instance
  final long shallowSize = RamUsageEstimator.alignObjectSize(
      RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF);
  return shallowSize
      + RamUsageEstimator.sizeOf(data)
      + positions.ramBytesUsed()
      + wordNums.ramBytesUsed();
}
}

View File

@ -0,0 +1,144 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
 * Tests {@link WAH8DocIdSet} by comparing its content and its iterators with
 * those of a {@link FixedBitSet} holding the same documents.
 */
public class TestWAH8DocIdSet extends LuceneTestCase {

  /** Returns a bit set of {@code numBits} bits in which exactly {@code numBitsSet} random bits are set. */
  private static FixedBitSet randomSet(int numBits, int numBitsSet) {
    assert numBitsSet <= numBits;
    final FixedBitSet set = new FixedBitSet(numBits);
    if (numBitsSet == numBits) {
      set.set(0, set.length());
    } else {
      for (int i = 0; i < numBitsSet; ++i) {
        // draw until a bit that is still clear is found
        while (true) {
          final int o = random().nextInt(numBits);
          if (!set.get(o)) {
            set.set(o);
            break;
          }
        }
      }
    }
    return set;
  }

  /** Returns a bit set of {@code numBits} bits in which a {@code percentSet} fraction (0 to 1) of random bits is set. */
  private static FixedBitSet randomSet(int numBits, float percentSet) {
    return randomSet(numBits, (int) (percentSet * numBits));
  }

  /** Copies bit sets of various densities, from empty to full, and checks that the copies are identical. */
  public void testAgainstFixedBitSet() throws IOException {
    final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
    for (float percentSet : new float[] {0f, 0.0001f, random().nextFloat() / 2, 0.9f, 1f}) {
      final FixedBitSet set = randomSet(numBits, percentSet);
      final WAH8DocIdSet copy = WAH8DocIdSet.copyOf(set.iterator());
      assertEquals(numBits, set, copy);
    }
  }

  /**
   * Asserts that both sets have the same cardinality and that their iterators
   * return the same documents, first with nextDoc only and then with a random
   * mix of nextDoc and advance.
   *
   * @param numBits number of bits of {@code ds1}, used to size the random advance targets
   */
  public void assertEquals(int numBits, FixedBitSet ds1, WAH8DocIdSet ds2) throws IOException {
    assertEquals(ds1.cardinality(), ds2.cardinality());

    // nextDoc
    DocIdSetIterator it1 = ds1.iterator();
    DocIdSetIterator it2 = ds2.iterator();
    assertEquals(it1.docID(), it2.docID());
    for (int doc = it1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it1.nextDoc()) {
      assertEquals(doc, it2.nextDoc());
      assertEquals(it1.docID(), it2.docID());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it2.nextDoc());
    assertEquals(it1.docID(), it2.docID());

    // nextDoc / advance
    it1 = ds1.iterator();
    it2 = ds2.iterator();
    for (int doc = -1; doc != DocIdSetIterator.NO_MORE_DOCS;) {
      if (random().nextBoolean()) {
        doc = it1.nextDoc();
        assertEquals(doc, it2.nextDoc());
        assertEquals(it1.docID(), it2.docID());
      } else {
        // mix short jumps (less than 64 docs) with longer ones (up to numBits / 64 docs)
        final int target = doc + 1 + random().nextInt(random().nextBoolean() ? 64 : numBits / 64);
        doc = it1.advance(target);
        assertEquals(doc, it2.advance(target));
        assertEquals(it1.docID(), it2.docID());
      }
    }
  }

  /** Checks the union of 0 to 4 sparse random sets against a bit set built by setting every document of every input. */
  public void testUnion() throws IOException {
    final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
    final int numDocIdSets = _TestUtil.nextInt(random(), 0, 4);
    final List<FixedBitSet> fixedSets = new ArrayList<FixedBitSet>(numDocIdSets);
    for (int i = 0; i < numDocIdSets; ++i) {
      fixedSets.add(randomSet(numBits, random().nextFloat() / 16));
    }
    final List<WAH8DocIdSet> compressedSets = new ArrayList<WAH8DocIdSet>(numDocIdSets);
    for (FixedBitSet set : fixedSets) {
      compressedSets.add(WAH8DocIdSet.copyOf(set.iterator()));
    }

    final WAH8DocIdSet union = WAH8DocIdSet.union(compressedSets);
    final FixedBitSet expected = new FixedBitSet(numBits);
    for (DocIdSet set : fixedSets) {
      final DocIdSetIterator it = set.iterator();
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        expected.set(doc);
      }
    }
    assertEquals(numBits, expected, union);
  }

  /**
   * Checks the intersection of 1 to 4 random sets against a bit set that starts
   * full and from which every gap between two consecutive documents of each
   * input is cleared.
   */
  public void testIntersection() throws IOException {
    final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
    final int numDocIdSets = _TestUtil.nextInt(random(), 1, 4);
    final List<FixedBitSet> fixedSets = new ArrayList<FixedBitSet>(numDocIdSets);
    for (int i = 0; i < numDocIdSets; ++i) {
      fixedSets.add(randomSet(numBits, random().nextFloat()));
    }
    final List<WAH8DocIdSet> compressedSets = new ArrayList<WAH8DocIdSet>(numDocIdSets);
    for (FixedBitSet set : fixedSets) {
      compressedSets.add(WAH8DocIdSet.copyOf(set.iterator()));
    }

    final WAH8DocIdSet intersection = WAH8DocIdSet.intersect(compressedSets);
    final FixedBitSet expected = new FixedBitSet(numBits);
    expected.set(0, expected.length());
    for (DocIdSet set : fixedSets) {
      final DocIdSetIterator it = set.iterator();
      int lastDoc = -1;
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        // documents strictly between two consecutive matches are absent from this input
        expected.clear(lastDoc + 1, doc);
        lastDoc = doc;
      }
      // so is everything after the last match
      if (lastDoc + 1 < expected.length()) {
        expected.clear(lastDoc + 1, expected.length());
      }
    }
    assertEquals(numBits, expected, intersection);
  }
}