mirror of https://github.com/apache/lucene.git
LUCENE-5081: WAH8DocIdSet.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1501925 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7d1e3ba363
commit
f45358ebc1
|
@ -51,6 +51,9 @@ New features
|
|||
* LUCENE-5084: Added new Elias-Fano encoder, decoder and DocIdSet
|
||||
implementations. (Paul Elschot via Adrien Grand)
|
||||
|
||||
* LUCENE-5081: Added WAH8DocIdSet, an in-memory doc id set implementation based
|
||||
on word-aligned hybrid encoding. (Adrien Grand)
|
||||
|
||||
======================= Lucene 4.4.0 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.io.Closeable;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
|
|
@ -44,6 +44,7 @@ import org.apache.lucene.store.IndexOutput;
|
|||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.GrowableByteArrayDataOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ import org.apache.lucene.store.IndexOutput;
|
|||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.GrowableByteArrayDataOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.packed.BlockPackedWriter;
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.MutableBits;
|
||||
|
||||
|
@ -166,7 +167,7 @@ final class BitVector implements Cloneable, MutableBits {
|
|||
int c = 0;
|
||||
int end = bits.length;
|
||||
for (int i = 0; i < end; i++) {
|
||||
c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
|
||||
c += BitUtil.bitCount(bits[i]); // sum bits per byte
|
||||
}
|
||||
count = c;
|
||||
}
|
||||
|
@ -179,29 +180,12 @@ final class BitVector implements Cloneable, MutableBits {
|
|||
int c = 0;
|
||||
int end = bits.length;
|
||||
for (int i = 0; i < end; i++) {
|
||||
c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
|
||||
c += BitUtil.bitCount(bits[i]); // sum bits per byte
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
private static final byte[] BYTE_COUNTS = { // table of bits/byte
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
|
||||
private static String CODEC = "BitVector";
|
||||
|
||||
|
@ -294,7 +278,7 @@ final class BitVector implements Cloneable, MutableBits {
|
|||
output.writeVInt(i-last);
|
||||
output.writeByte(bits[i]);
|
||||
last = i;
|
||||
numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]);
|
||||
numCleared -= (8-BitUtil.bitCount(bits[i]));
|
||||
assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7)));
|
||||
}
|
||||
}
|
||||
|
@ -399,7 +383,7 @@ final class BitVector implements Cloneable, MutableBits {
|
|||
while (n>0) {
|
||||
last += input.readVInt();
|
||||
bits[last] = input.readByte();
|
||||
n -= BYTE_COUNTS[bits[last] & 0xFF];
|
||||
n -= BitUtil.bitCount(bits[last]);
|
||||
assert n >= 0;
|
||||
}
|
||||
}
|
||||
|
@ -416,7 +400,7 @@ final class BitVector implements Cloneable, MutableBits {
|
|||
while (numCleared>0) {
|
||||
last += input.readVInt();
|
||||
bits[last] = input.readByte();
|
||||
numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF];
|
||||
numCleared -= 8-BitUtil.bitCount(bits[last]);
|
||||
assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7)));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,8 +22,93 @@ package org.apache.lucene.util; // from org.apache.solr.util rev 555343
|
|||
*/
|
||||
public final class BitUtil {
|
||||
|
||||
private static final byte[] BYTE_COUNTS = { // table of bits/byte
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
// The General Idea: instead of having an array per byte that has
|
||||
// the offsets of the next set bit, that array could be
|
||||
// packed inside a 32 bit integer (8 4 bit numbers). That
|
||||
// should be faster than accessing an array for each index, and
|
||||
// the total array size is kept smaller (256*sizeof(int))=1K
|
||||
/***** the python code that generated bitlist
|
||||
def bits2int(val):
|
||||
arr=0
|
||||
for shift in range(8,0,-1):
|
||||
if val & 0x80:
|
||||
arr = (arr << 4) | shift
|
||||
val = val << 1
|
||||
return arr
|
||||
|
||||
def int_table():
|
||||
tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
|
||||
return ','.join(tbl)
|
||||
******/
|
||||
private static final int[] BIT_LISTS = {
|
||||
0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
|
||||
0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
|
||||
0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
|
||||
0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
|
||||
0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
|
||||
0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
|
||||
0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
|
||||
0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
|
||||
0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
|
||||
0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
|
||||
0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
|
||||
0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
|
||||
0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
|
||||
0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
|
||||
0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
|
||||
0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
|
||||
0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
|
||||
0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
|
||||
0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
|
||||
0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
|
||||
0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
|
||||
0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
|
||||
0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
|
||||
0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
|
||||
0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
|
||||
0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
|
||||
0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
|
||||
0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
|
||||
};
|
||||
|
||||
private BitUtil() {} // no instance
|
||||
|
||||
/** Return the number of bits sets in b. */
|
||||
public static int bitCount(byte b) {
|
||||
return BYTE_COUNTS[b & 0xFF];
|
||||
}
|
||||
|
||||
/** Return the list of bits which are set in b encoded as followed:
|
||||
* <code>(i >>> (4 * n)) & 0x0F</code> is the offset of the n-th set bit of
|
||||
* the given byte plus one, or 0 if there are n or less bits set in the given
|
||||
* byte. For example <code>bitList(12)</code> returns 0x43:<ul>
|
||||
* <li><code>0x43 & 0x0F</code> is 3, meaning the the first bit set is at offset 3-1 = 2,</li>
|
||||
* <li><code>(0x43 >>> 4) & 0x0F</code> is 4, meaning there is a second bit set at offset 4-1=3,</li>
|
||||
* <li><code>(0x43 >>> 8) & 0x0F</code> is 0, meaning there is no more bit set in this byte.</li>
|
||||
* </ul>*/
|
||||
public static int bitList(byte b) {
|
||||
return BIT_LISTS[b & 0xFF];
|
||||
}
|
||||
|
||||
// The pop methods used to rely on bit-manipulation tricks for speed but it
|
||||
// turns out that it is faster to use the Long.bitCount method (which is an
|
||||
// intrinsic since Java 6u18) in a naive loop, see LUCENE-2221
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.codecs.compressing;
|
||||
package org.apache.lucene.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -17,26 +17,27 @@ package org.apache.lucene.codecs.compressing;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
/**
|
||||
* A {@link DataOutput} that can be used to build a byte[].
|
||||
* @lucene.internal
|
||||
*/
|
||||
final class GrowableByteArrayDataOutput extends DataOutput {
|
||||
public final class GrowableByteArrayDataOutput extends DataOutput {
|
||||
|
||||
byte[] bytes;
|
||||
int length;
|
||||
/** The bytes */
|
||||
public byte[] bytes;
|
||||
/** The length */
|
||||
public int length;
|
||||
|
||||
GrowableByteArrayDataOutput(int cp) {
|
||||
/** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
|
||||
public GrowableByteArrayDataOutput(int cp) {
|
||||
this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
|
||||
this.length = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeByte(byte b) throws IOException {
|
||||
public void writeByte(byte b) {
|
||||
if (length >= bytes.length) {
|
||||
bytes = ArrayUtil.grow(bytes);
|
||||
}
|
||||
|
@ -44,7 +45,7 @@ final class GrowableByteArrayDataOutput extends DataOutput {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void writeBytes(byte[] b, int off, int len) throws IOException {
|
||||
public void writeBytes(byte[] b, int off, int len) {
|
||||
final int newLength = length + len;
|
||||
if (newLength > bytes.length) {
|
||||
bytes = ArrayUtil.grow(bytes, newLength);
|
|
@ -25,55 +25,6 @@ import org.apache.lucene.search.DocIdSetIterator;
|
|||
*/
|
||||
public class OpenBitSetIterator extends DocIdSetIterator {
|
||||
|
||||
// The General Idea: instead of having an array per byte that has
|
||||
// the offsets of the next set bit, that array could be
|
||||
// packed inside a 32 bit integer (8 4 bit numbers). That
|
||||
// should be faster than accessing an array for each index, and
|
||||
// the total array size is kept smaller (256*sizeof(int))=1K
|
||||
protected final static int[] bitlist={
|
||||
0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
|
||||
0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
|
||||
0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
|
||||
0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
|
||||
0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
|
||||
0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
|
||||
0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
|
||||
0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
|
||||
0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
|
||||
0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
|
||||
0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
|
||||
0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
|
||||
0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
|
||||
0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
|
||||
0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
|
||||
0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
|
||||
0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
|
||||
0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
|
||||
0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
|
||||
0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
|
||||
0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
|
||||
0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
|
||||
0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
|
||||
0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
|
||||
0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
|
||||
0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
|
||||
0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
|
||||
0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
|
||||
};
|
||||
/***** the python code that generated bitlist
|
||||
def bits2int(val):
|
||||
arr=0
|
||||
for shift in range(8,0,-1):
|
||||
if val & 0x80:
|
||||
arr = (arr << 4) | shift
|
||||
val = val << 1
|
||||
return arr
|
||||
|
||||
def int_table():
|
||||
tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
|
||||
return ','.join(tbl)
|
||||
******/
|
||||
|
||||
// hmmm, what about an iterator that finds zeros though,
|
||||
// or a reverse iterator... should they be separate classes
|
||||
// for efficiency, or have a common root interface? (or
|
||||
|
@ -101,7 +52,7 @@ public class OpenBitSetIterator extends DocIdSetIterator {
|
|||
if ((int)word ==0) {wordShift +=32; word = word >>>32; }
|
||||
if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
|
||||
if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
|
||||
indexArray = bitlist[(int)word & 0xff];
|
||||
indexArray = BitUtil.bitList((byte) word);
|
||||
}
|
||||
|
||||
/***** alternate shift implementations
|
||||
|
|
|
@ -0,0 +1,625 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* {@link DocIdSet} implementation based on word-aligned hybrid encoding on
|
||||
* words of 8 bits.
|
||||
* <p>This implementation doesn't support random-access but has a fast
|
||||
* {@link DocIdSetIterator} which can advance in logarithmic time thanks to
|
||||
* an index.</p>
|
||||
* <p>The compression scheme is simplistic and should work well with sparse doc
|
||||
* id sets while being only slightly larger than a {@link FixedBitSet} for
|
||||
* incompressible sets (overhead<2% in the worst case) in spite of the index.</p>
|
||||
* <p><b>Format</b>: The format is byte-aligned. An 8-bits word is either clean,
|
||||
* meaning composed only of zeros, or dirty, meaning that it contains at least one
|
||||
* bit set. The idea is to encode sequences of clean words using run-length
|
||||
* encoding and to leave sequences of dirty words as-is.</p>
|
||||
* <table>
|
||||
* <tr><th>Token</th><th>Clean length+</th><th>Dirty length+</th><th>Dirty words</th></tr>
|
||||
* <tr><td>1 byte</td><td>0-n bytes</td><td>0-n bytes</td><td>0-n bytes</td></tr>
|
||||
* </table>
|
||||
* <ul>
|
||||
* <li><b>Token</b> encodes the number of clean words minus 2 on the first 4
|
||||
* bits and the number of dirty words minus 1 on the last 4 bits. The
|
||||
* higher-order bit is a continuation bit, meaning that the number is incomplete
|
||||
* and needs additional bytes to be read.</li>
|
||||
* <li><b>Clean length+</b>: If clean length has its higher-order bit set,
|
||||
* you need to read a {@link DataInput#readVInt() vint}, shift it by 3 bits on
|
||||
* the left side and add it to the 3 bits which have been read in the token.</li>
|
||||
* <li><b>Dirty length+</b> works the same way as <b>Clean length+</b> but
|
||||
* for the length of dirty words.</li>
|
||||
* <li><b>Dirty words</b> are the dirty words, there are <b>Dirty length</b>
|
||||
* of them.</li>
|
||||
* </ul>
|
||||
* <p>This format cannot encode sequences of less than 2 clean words and 1 dirty
|
||||
* word. The reason is that if you find a single clean word, you should rather
|
||||
* encode it as a dirty word. This takes the same space as starting a new
|
||||
* sequence (since you need one byte for the token) but will be lighter to
|
||||
* decode. There is however an exception for the first sequence. Since the first
|
||||
* sequence may start directly with a dirty word, the clean length is encoded
|
||||
* directly, without subtracting 2.</p>
|
||||
* <p>There is an additional restriction on the format: the sequence of dirty
|
||||
* words must start and end with a non-null word and is not allowed to contain
|
||||
* two consecutive null words. This restriction exists to make sure no space is
|
||||
* wasted and to make sure iterators can read the next doc ID by reading at most
|
||||
* 2 dirty words.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class WAH8DocIdSet extends DocIdSet {
|
||||
|
||||
// Minimum index interval, intervals below this value can't guarantee anymore
|
||||
// that this set implementation won't be significantly larger than a FixedBitSet
|
||||
// The reason is that a single sequence saves at least one byte and an index
|
||||
// entry requires at most 8 bytes (2 ints) so there shouldn't be more than one
|
||||
// index entry every 8 sequences
|
||||
private static final int MIN_INDEX_INTERVAL = 8;
|
||||
|
||||
/** Default index interval. */
|
||||
// To compute this default value, I created a rather dense set (0.1% load
|
||||
// factor, which is close to the worst case regarding both compression and
|
||||
// speed for this DocIdSet impl since sequences are going to be short) and I
|
||||
// started with interval=1 and doubled it at each iteration until advance
|
||||
// became slower
|
||||
public static final int DEFAULT_INDEX_INTERVAL = 16;
|
||||
|
||||
private static final PackedInts.Reader EMPTY_READER = new PackedInts.NullReader(1);
|
||||
private static WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], EMPTY_READER, EMPTY_READER);
|
||||
|
||||
private static final Comparator<Iterator> SERIALIZED_LENGTH_COMPARATOR = new Comparator<Iterator>() {
|
||||
@Override
|
||||
public int compare(Iterator wi1, Iterator wi2) {
|
||||
return wi1.in.length() - wi2.in.length();
|
||||
}
|
||||
};
|
||||
|
||||
/** Same as {@link #copyOf(DocIdSetIterator, int)} with the default index interval. */
|
||||
public static WAH8DocIdSet copyOf(DocIdSetIterator it) throws IOException {
|
||||
return copyOf(it, DEFAULT_INDEX_INTERVAL);
|
||||
}
|
||||
|
||||
/** Return a copy of the provided iterator. */
|
||||
public static WAH8DocIdSet copyOf(DocIdSetIterator it, int indexInterval) throws IOException {
|
||||
Builder builder = new Builder().setIndexInterval(indexInterval);
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
builder.add(doc);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/** Same as {@link #intersect(Collection, int)} with the default index interval. */
|
||||
public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets) {
|
||||
return intersect(docIdSets, DEFAULT_INDEX_INTERVAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the intersection of the provided sets. This method is much faster than
|
||||
* computing the intersection manually since it operates directly at the byte level.
|
||||
*/
|
||||
public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
|
||||
switch (docIdSets.size()) {
|
||||
case 0:
|
||||
throw new IllegalArgumentException("There must be at least one set to intersect");
|
||||
case 1:
|
||||
return docIdSets.iterator().next();
|
||||
}
|
||||
// The logic below is similar to ConjunctionScorer
|
||||
final int numSets = docIdSets.size();
|
||||
final Iterator[] iterators = new Iterator[numSets];
|
||||
int i = 0;
|
||||
for (WAH8DocIdSet set : docIdSets) {
|
||||
final Iterator it = set.iterator();
|
||||
iterators[i++] = it;
|
||||
}
|
||||
Arrays.sort(iterators, SERIALIZED_LENGTH_COMPARATOR);
|
||||
final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
|
||||
int wordNum = 0;
|
||||
main:
|
||||
while (true) {
|
||||
// Advance the least costly iterator first
|
||||
iterators[0].advanceWord(wordNum);
|
||||
wordNum = iterators[0].wordNum;
|
||||
if (wordNum == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
byte word = iterators[0].word;
|
||||
for (i = 1; i < numSets; ++i) {
|
||||
if (iterators[i].wordNum < wordNum) {
|
||||
iterators[i].advanceWord(wordNum);
|
||||
}
|
||||
if (iterators[i].wordNum > wordNum) {
|
||||
wordNum = iterators[i].wordNum;
|
||||
continue main;
|
||||
}
|
||||
assert iterators[i].wordNum == wordNum;
|
||||
word &= iterators[i].word;
|
||||
if (word == 0) {
|
||||
// There are common words, but they don't share any bit
|
||||
++wordNum;
|
||||
continue main;
|
||||
}
|
||||
}
|
||||
// Found a common word
|
||||
assert word != 0;
|
||||
builder.addWord(wordNum, word);
|
||||
++wordNum;
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/** Same as {@link #union(Collection, int)} with the default index interval. */
|
||||
public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets) {
|
||||
return union(docIdSets, DEFAULT_INDEX_INTERVAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the union of the provided sets. This method is much faster than
|
||||
* computing the union manually since it operates directly at the byte level.
|
||||
*/
|
||||
public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
|
||||
switch (docIdSets.size()) {
|
||||
case 0:
|
||||
return EMPTY;
|
||||
case 1:
|
||||
return docIdSets.iterator().next();
|
||||
}
|
||||
// The logic below is very similar to DisjunctionScorer
|
||||
final int numSets = docIdSets.size();
|
||||
final PriorityQueue<Iterator> iterators = new PriorityQueue<WAH8DocIdSet.Iterator>(numSets) {
|
||||
@Override
|
||||
protected boolean lessThan(Iterator a, Iterator b) {
|
||||
return a.wordNum < b.wordNum;
|
||||
}
|
||||
};
|
||||
for (WAH8DocIdSet set : docIdSets) {
|
||||
Iterator iterator = set.iterator();
|
||||
iterator.nextWord();
|
||||
iterators.add(iterator);
|
||||
}
|
||||
|
||||
Iterator top = iterators.top();
|
||||
if (top.wordNum == Integer.MAX_VALUE) {
|
||||
return EMPTY;
|
||||
}
|
||||
int wordNum = top.wordNum;
|
||||
byte word = top.word;
|
||||
final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
|
||||
while (true) {
|
||||
top.nextWord();
|
||||
iterators.updateTop();
|
||||
top = iterators.top();
|
||||
if (top.wordNum == wordNum) {
|
||||
word |= top.word;
|
||||
} else {
|
||||
builder.addWord(wordNum, word);
|
||||
if (top.wordNum == Integer.MAX_VALUE) {
|
||||
break;
|
||||
}
|
||||
wordNum = top.wordNum;
|
||||
word = top.word;
|
||||
}
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
static int wordNum(int docID) {
|
||||
assert docID >= 0;
|
||||
return docID >>> 3;
|
||||
}
|
||||
|
||||
/** Word-based builder. */
|
||||
static class WordBuilder {
|
||||
|
||||
final GrowableByteArrayDataOutput out;
|
||||
final GrowableByteArrayDataOutput dirtyWords;
|
||||
int clean;
|
||||
int lastWordNum;
|
||||
int numSequences;
|
||||
int indexInterval;
|
||||
|
||||
WordBuilder() {
|
||||
out = new GrowableByteArrayDataOutput(1024);
|
||||
dirtyWords = new GrowableByteArrayDataOutput(128);
|
||||
clean = 0;
|
||||
lastWordNum = -1;
|
||||
numSequences = 0;
|
||||
indexInterval = DEFAULT_INDEX_INTERVAL;
|
||||
}
|
||||
|
||||
/** Set the index interval. Smaller index intervals improve performance of
|
||||
* {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet}
|
||||
* larger. An index interval <code>i</code> makes the index add an overhead
|
||||
* which is at most <code>4/i</code>, but likely much less.The default index
|
||||
* interval is <code>16</code>, meaning the index has an overhead of at most
|
||||
* 25%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as an
|
||||
* index interval. */
|
||||
public WordBuilder setIndexInterval(int indexInterval) {
|
||||
if (indexInterval < MIN_INDEX_INTERVAL) {
|
||||
throw new IllegalArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
|
||||
}
|
||||
this.indexInterval = indexInterval;
|
||||
return this;
|
||||
}
|
||||
|
||||
void writeHeader(int cleanLength) throws IOException {
|
||||
final int cleanLengthMinus2 = cleanLength - 2;
|
||||
final int dirtyLengthMinus1 = dirtyWords.length - 1;
|
||||
assert cleanLengthMinus2 >= 0;
|
||||
assert dirtyLengthMinus1 >= 0;
|
||||
int token = ((cleanLengthMinus2 & 0x07) << 4) | (dirtyLengthMinus1 & 0x07);
|
||||
if (cleanLengthMinus2 > 0x07) {
|
||||
token |= 1 << 7;
|
||||
}
|
||||
if (dirtyLengthMinus1 > 0x07) {
|
||||
token |= 1 << 3;
|
||||
}
|
||||
out.writeByte((byte) token);
|
||||
if (cleanLengthMinus2 > 0x07) {
|
||||
out.writeVInt(cleanLengthMinus2 >>> 3);
|
||||
}
|
||||
if (dirtyLengthMinus1 > 0x07) {
|
||||
out.writeVInt(dirtyLengthMinus1 >>> 3);
|
||||
}
|
||||
}
|
||||
|
||||
void writeSequence(int cleanLength) {
|
||||
try {
|
||||
writeHeader(cleanLength);
|
||||
out.writeBytes(dirtyWords.bytes, dirtyWords.length);
|
||||
} catch (IOException cannotHappen) {
|
||||
throw new AssertionError(cannotHappen);
|
||||
}
|
||||
dirtyWords.length = 0;
|
||||
++numSequences;
|
||||
}
|
||||
|
||||
void addWord(int wordNum, byte word) {
|
||||
assert wordNum > lastWordNum;
|
||||
assert word != 0;
|
||||
|
||||
if (lastWordNum == -1) {
|
||||
clean = 2 + wordNum; // special case for the 1st sequence
|
||||
dirtyWords.writeByte(word);
|
||||
} else {
|
||||
switch (wordNum - lastWordNum) {
|
||||
case 1:
|
||||
dirtyWords.writeByte(word);
|
||||
break;
|
||||
case 2:
|
||||
dirtyWords.writeByte((byte) 0);
|
||||
dirtyWords.writeByte(word);
|
||||
break;
|
||||
default:
|
||||
writeSequence(clean);
|
||||
clean = wordNum - lastWordNum - 1;
|
||||
dirtyWords.writeByte(word);
|
||||
}
|
||||
}
|
||||
lastWordNum = wordNum;
|
||||
}
|
||||
|
||||
/** Build a new {@link WAH8DocIdSet}. */
|
||||
public WAH8DocIdSet build() {
|
||||
if (lastWordNum == -1) {
|
||||
return EMPTY;
|
||||
}
|
||||
writeSequence(clean);
|
||||
final byte[] data = Arrays.copyOf(out.bytes, out.length);
|
||||
|
||||
// Now build the index
|
||||
final int valueCount = (numSequences - 1) / indexInterval + 1;
|
||||
final PackedInts.Reader indexPositions;
|
||||
final PackedInts.Reader indexWordNums;
|
||||
if (valueCount <= 1) {
|
||||
indexPositions = indexWordNums = EMPTY_READER;
|
||||
} else {
|
||||
// From the tests I ran, there is no need to expose acceptableOverheadRatio, these packed ints are never the bottleneck
|
||||
final PackedInts.Mutable positions = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(data.length - 1), PackedInts.COMPACT);
|
||||
final PackedInts.Mutable wordNums = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(lastWordNum), PackedInts.COMPACT);
|
||||
|
||||
final Iterator it = new Iterator(data, null, null);
|
||||
assert it.in.getPosition() == 0;
|
||||
assert it.wordNum == -1;
|
||||
for (int i = 1; i < valueCount; ++i) {
|
||||
// skip indexInterval sequences
|
||||
for (int j = 0; j < indexInterval; ++j) {
|
||||
final boolean readSequence = it.readSequence();
|
||||
assert readSequence;
|
||||
it.skipDirtyBytes();
|
||||
}
|
||||
final int position = it.in.getPosition();
|
||||
final int wordNum = it.wordNum;
|
||||
positions.set(i, position);
|
||||
wordNums.set(i, wordNum + 1);
|
||||
}
|
||||
indexPositions = positions;
|
||||
indexWordNums = wordNums;
|
||||
}
|
||||
|
||||
return new WAH8DocIdSet(data, indexPositions, indexWordNums);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** A builder for {@link WAH8DocIdSet}s. */
|
||||
public static final class Builder extends WordBuilder {
|
||||
|
||||
private int lastDocID;
|
||||
private int wordNum, word;
|
||||
|
||||
/** Sole constructor */
|
||||
public Builder() {
|
||||
super();
|
||||
lastDocID = -1;
|
||||
wordNum = -1;
|
||||
word = 0;
|
||||
}
|
||||
|
||||
/** Add a document to this builder. Documents must be added in order. */
|
||||
public Builder add(int docID) {
|
||||
if (docID <= lastDocID) {
|
||||
throw new IllegalArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID);
|
||||
}
|
||||
final int wordNum = wordNum(docID);
|
||||
if (this.wordNum == -1) {
|
||||
this.wordNum = wordNum;
|
||||
word = 1 << (docID & 0x07);
|
||||
} else if (wordNum == this.wordNum) {
|
||||
word |= 1 << (docID & 0x07);
|
||||
} else {
|
||||
addWord(this.wordNum, (byte) word);
|
||||
this.wordNum = wordNum;
|
||||
word = 1 << (docID & 0x07);
|
||||
}
|
||||
lastDocID = docID;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Builder setIndexInterval(int indexInterval) {
|
||||
return (Builder) super.setIndexInterval(indexInterval);
|
||||
}
|
||||
|
||||
@Override
|
||||
public WAH8DocIdSet build() {
|
||||
if (this.wordNum != -1) {
|
||||
addWord(wordNum, (byte) word);
|
||||
}
|
||||
return super.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// where the doc IDs are stored (word-aligned hybrid encoded byte sequences)
private final byte[] data;
// index for advance(int)
private final PackedInts.Reader positions, wordNums; // wordNums[i] starts at the sequence at positions[i]

/**
 * Wraps already-encoded data together with its skip index.
 *
 * @param data      the encoded doc id set
 * @param positions byte positions into {@code data} of indexed sequences
 * @param wordNums  word number at which each indexed sequence starts
 */
WAH8DocIdSet(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
  this.data = data;
  this.positions = positions;
  this.wordNums = wordNums;
}
|
||||
|
||||
@Override
public boolean isCacheable() {
  // The set is a fully materialized in-memory structure, so it is safe
  // (and cheap) for callers to cache it.
  return true;
}
|
||||
|
||||
@Override
|
||||
public Iterator iterator() {
|
||||
return new Iterator(data, positions, wordNums);
|
||||
}
|
||||
|
||||
static int readLength(ByteArrayDataInput in, int len) {
|
||||
if ((len & 0x08) == 0) {
|
||||
// no continuation bit
|
||||
return len;
|
||||
}
|
||||
return (len & 0x07) | (in.readVInt() << 3);
|
||||
}
|
||||
|
||||
/**
 * Iterator over a WAH8-encoded doc id set. Decodes sequences of
 * (clean length, dirty length, dirty bytes) as it goes, maintaining the
 * number of the current 8-bit word and the remaining dirty bytes of the
 * current sequence.
 */
static class Iterator extends DocIdSetIterator {

  // Read position over the encoded data.
  final ByteArrayDataInput in;
  // Skip index: wordNums[i] starts at the sequence at positions[i].
  // May be null when no index was built (see callers in this file).
  final PackedInts.Reader positions, wordNums;
  // Number of dirty (explicitly stored) bytes remaining in the current sequence.
  int dirtyLength;

  int wordNum; // number of the current word (docID >>> 3); NOT a byte offset into 'in'
  byte word; // current word
  int bitList; // list of bits set in the current word, 4 bits per entry (see BitUtil.bitList)

  int docID;

  Iterator(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
    this.in = new ByteArrayDataInput(data);
    this.positions = positions;
    this.wordNums = wordNums;
    wordNum = -1;
    word = 0;
    bitList = 0;
    docID = -1;
  }

  /**
   * Read the header of the next sequence: advance wordNum past the clean
   * (all-zero) words and record how many dirty bytes follow. Returns false
   * and sets wordNum to Integer.MAX_VALUE at end of stream.
   */
  boolean readSequence() {
    if (in.eof()) {
      wordNum = Integer.MAX_VALUE;
      return false;
    }
    final int token = in.readByte() & 0xFF;
    // The first sequence of the stream encodes its clean length as-is; every
    // later sequence stores cleanLength - 2 (a minimum gap of 2 is implied).
    final int cleanLength = (in.getPosition() == 1 ? 0 : 2) + readLength(in, token >>> 4);
    wordNum += cleanLength;
    // Dirty lengths are stored minus 1: a sequence always has >= 1 dirty byte.
    dirtyLength = 1 + readLength(in, token & 0x0F);
    return true;
  }

  /** Skip {@code count} dirty bytes of the current sequence. */
  void skipDirtyBytes(int count) {
    assert count >= 0;
    assert count <= dirtyLength;
    in.skipBytes(count);
    wordNum += count;
    dirtyLength -= count;
  }

  /** Skip all remaining dirty bytes of the current sequence. */
  void skipDirtyBytes() {
    in.skipBytes(dirtyLength);
    wordNum += dirtyLength;
    dirtyLength = 0;
  }

  /**
   * Advance to the next non-null word, reading a new sequence header first
   * if the current one is exhausted. Leaves wordNum at Integer.MAX_VALUE on
   * end of stream.
   */
  void nextWord() {
    if (dirtyLength == 0 && !readSequence()) {
      return;
    }
    word = in.readByte();
    if (word == 0) {
      // A single null dirty byte is allowed as filler; skip over it.
      word = in.readByte();
      assert word != 0; // there can never be two consecutive null dirty words
      ++wordNum;
      --dirtyLength;
    }
    ++wordNum;
    --dirtyLength;
  }

  /**
   * Find the greatest index i such that wordNums[i] <= targetWordNum,
   * i.e. the indexed sequence from which to start forward scanning.
   */
  int binarySearch(int targetWordNum) {
    int lo = 0, hi = positions.size() - 1;
    while (lo <= hi) {
      final int mid = (lo + hi) >>> 1;
      final int midWordNum = (int) wordNums.get(mid);
      if (midWordNum <= targetWordNum) {
        lo = mid + 1;
      } else {
        hi = mid - 1;
      }
    }
    assert wordNums.get(hi) <= targetWordNum;
    assert hi+1 == wordNums.size() || wordNums.get(hi + 1) > targetWordNum;
    return hi;
  }

  /**
   * Advance until wordNum >= targetWordNum, using the skip index when the
   * target is beyond the current sequence.
   */
  void advanceWord(int targetWordNum) {
    assert targetWordNum > wordNum;
    if (dirtyLength == 0 && !readSequence()) {
      return;
    }
    int delta = targetWordNum - wordNum;
    if (delta <= dirtyLength + 1) {
      // Target falls within the dirty bytes of the current sequence;
      // leave one byte for nextWord() below to consume.
      if (delta > 1) {
        skipDirtyBytes(delta - 1);
      }
    } else {
      skipDirtyBytes();
      assert dirtyLength == 0;
      // use the index
      final int i = binarySearch(targetWordNum);
      final int position = (int) positions.get(i);
      if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move
        wordNum = (int) wordNums.get(i) - 1;
        in.setPosition(position);
      }

      // Linear scan over sequences from the indexed position until the
      // target word is within reach of the current sequence.
      while (true) {
        if (!readSequence()) {
          return;
        }
        delta = targetWordNum - wordNum;
        if (delta <= dirtyLength + 1) {
          if (delta > 1) {
            skipDirtyBytes(delta - 1);
          }
          break;
        }
        skipDirtyBytes();
      }
    }

    nextWord();
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int nextDoc() throws IOException {
    if (bitList != 0) { // there are remaining bits in the current word
      // Each 4-bit entry of bitList is (bit position + 1) within the word.
      docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
      bitList >>>= 4;
      return docID;
    }
    nextWord();
    if (wordNum == Integer.MAX_VALUE) {
      return docID = NO_MORE_DOCS;
    }
    bitList = BitUtil.bitList(word);
    assert bitList != 0;
    docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
    bitList >>>= 4;
    return docID;
  }

  @Override
  public int advance(int target) throws IOException {
    assert target > docID;
    final int targetWordNum = wordNum(target);
    if (targetWordNum > wordNum) {
      advanceWord(targetWordNum);
      bitList = BitUtil.bitList(word);
    }
    // Step doc-by-doc within the reached word up to the target.
    return slowAdvance(target);
  }

  @Override
  public long cost() {
    return in.length(); // good estimation of the cost of iterating over all docs
  }

}
|
||||
|
||||
/** Return the number of documents in this {@link DocIdSet}. This method
|
||||
* runs in linear time but is much faster than counting documents. */
|
||||
public int cardinality() {
|
||||
int cardinality = 0;
|
||||
for (Iterator it = iterator(); it.wordNum != Integer.MAX_VALUE; it.nextWord()) {
|
||||
cardinality += BitUtil.bitCount(it.word);
|
||||
}
|
||||
return cardinality;
|
||||
}
|
||||
|
||||
/** Return the memory usage of this class in bytes. */
|
||||
public long ramBytesUsed() {
|
||||
return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF)
|
||||
+ RamUsageEstimator.sizeOf(data)
|
||||
+ positions.ramBytesUsed()
|
||||
+ wordNums.ramBytesUsed();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,144 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
|
||||
/**
 * Randomized tests for {@link WAH8DocIdSet}: checks iteration, advancing,
 * union and intersection against a {@link FixedBitSet} reference.
 *
 * NOTE: the methods below depend on the exact order of random() draws for
 * seed reproducibility; do not reorder calls.
 */
public class TestWAH8DocIdSet extends LuceneTestCase {

  /** Build a FixedBitSet of {@code numBits} bits with exactly {@code numBitsSet} random bits set. */
  private static FixedBitSet randomSet(int numBits, int numBitsSet) {
    assert numBitsSet <= numBits;
    final FixedBitSet set = new FixedBitSet(numBits);
    if (numBitsSet == numBits) {
      set.set(0, set.length());
    } else {
      // Rejection sampling: redraw until an unset bit is found.
      for (int i = 0; i < numBitsSet; ++i) {
        while (true) {
          final int o = random().nextInt(numBits);
          if (!set.get(o)) {
            set.set(o);
            break;
          }
        }
      }
    }
    return set;
  }

  /** Build a FixedBitSet with approximately {@code percentSet} of its bits set. */
  private static FixedBitSet randomSet(int numBits, float percentSet) {
    return randomSet(numBits, (int) (percentSet * numBits));
  }

  /** Round-trip a random set through WAH8 encoding at several densities. */
  public void testAgainstFixedBitSet() throws IOException {
    final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
    for (float percentSet : new float[] {0f, 0.0001f, random().nextFloat() / 2, 0.9f, 1f}) {
      final FixedBitSet set = randomSet(numBits, percentSet);
      final WAH8DocIdSet copy = WAH8DocIdSet.copyOf(set.iterator());
      assertEquals(numBits, set, copy);
    }
  }

  /** Assert that {@code ds2} exposes exactly the same doc ids as {@code ds1}. */
  public void assertEquals(int numBits, FixedBitSet ds1, WAH8DocIdSet ds2) throws IOException {
    assertEquals(ds1.cardinality(), ds2.cardinality());

    // nextDoc
    DocIdSetIterator it1 = ds1.iterator();
    DocIdSetIterator it2 = ds2.iterator();
    assertEquals(it1.docID(), it2.docID());
    for (int doc = it1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it1.nextDoc()) {
      assertEquals(doc, it2.nextDoc());
      assertEquals(it1.docID(), it2.docID());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it2.nextDoc());
    assertEquals(it1.docID(), it2.docID());

    // nextDoc / advance
    // Randomly interleave nextDoc and advance to exercise both code paths,
    // including the skip index used by advance(int).
    it1 = ds1.iterator();
    it2 = ds2.iterator();
    for (int doc = -1; doc != DocIdSetIterator.NO_MORE_DOCS;) {
      if (random().nextBoolean()) {
        doc = it1.nextDoc();
        assertEquals(doc, it2.nextDoc());
        assertEquals(it1.docID(), it2.docID());
      } else {
        final int target = doc + 1 + random().nextInt(random().nextBoolean() ? 64 : numBits / 64);
        doc = it1.advance(target);
        assertEquals(doc, it2.advance(target));
        assertEquals(it1.docID(), it2.docID());
      }
    }
  }

  /** Union of 0..4 random sparse sets must match a bit-by-bit union. */
  public void testUnion() throws IOException {
    final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
    final int numDocIdSets = _TestUtil.nextInt(random(), 0, 4);
    final List<FixedBitSet> fixedSets = new ArrayList<FixedBitSet>(numDocIdSets);
    for (int i = 0; i < numDocIdSets; ++i) {
      fixedSets.add(randomSet(numBits, random().nextFloat() / 16));
    }
    final List<WAH8DocIdSet> compressedSets = new ArrayList<WAH8DocIdSet>(numDocIdSets);
    for (FixedBitSet set : fixedSets) {
      compressedSets.add(WAH8DocIdSet.copyOf(set.iterator()));
    }

    final WAH8DocIdSet union = WAH8DocIdSet.union(compressedSets);
    final FixedBitSet expected = new FixedBitSet(numBits);
    for (DocIdSet set : fixedSets) {
      final DocIdSetIterator it = set.iterator();
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        expected.set(doc);
      }
    }
    assertEquals(numBits, expected, union);
  }

  /** Intersection of 1..4 random dense sets must match a bit-by-bit intersection. */
  public void testIntersection() throws IOException {
    final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
    final int numDocIdSets = _TestUtil.nextInt(random(), 1, 4);
    final List<FixedBitSet> fixedSets = new ArrayList<FixedBitSet>(numDocIdSets);
    for (int i = 0; i < numDocIdSets; ++i) {
      fixedSets.add(randomSet(numBits, random().nextFloat()));
    }
    final List<WAH8DocIdSet> compressedSets = new ArrayList<WAH8DocIdSet>(numDocIdSets);
    for (FixedBitSet set : fixedSets) {
      compressedSets.add(WAH8DocIdSet.copyOf(set.iterator()));
    }

    // NOTE(review): this local holds an intersection; consider renaming
    // 'union' to 'intersection' for clarity.
    final WAH8DocIdSet union = WAH8DocIdSet.intersect(compressedSets);
    // Start from the full set and clear every range absent from each input.
    final FixedBitSet expected = new FixedBitSet(numBits);
    expected.set(0, expected.length());
    for (DocIdSet set : fixedSets) {
      final DocIdSetIterator it = set.iterator();
      int lastDoc = -1;
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        expected.clear(lastDoc + 1, doc);
        lastDoc = doc;
      }
      if (lastDoc + 1 < expected.length()) {
        expected.clear(lastDoc + 1, expected.length());
      }
    }
    assertEquals(numBits, expected, union);
  }

}
|
Loading…
Reference in New Issue