mirror of https://github.com/apache/lucene.git
LUCENE-3254: fix isSparse logic in BitVector; add version header to saved BitVector (*.del) files
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140720 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
71d4dc370b
commit
5b56b5ee98
|
@ -489,6 +489,10 @@ Bug fixes
|
|||
* LUCENE-3251: Directory#copy failed to close target output if opening the
|
||||
source stream failed. (Simon Willnauer)
|
||||
|
||||
* LUCENE-3254: Fixed minor bug in deletes were written to disk,
|
||||
causing the file to sometimes be larger than it needed to be. (Mike
|
||||
McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated
|
||||
|
|
|
@ -24,13 +24,15 @@ import org.apache.lucene.store.IndexInput;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
|
||||
/** Optimized implementation of a vector of bits. This is more-or-less like
|
||||
java.util.BitSet, but also includes the following:
|
||||
<ul>
|
||||
<li>a count() method, which efficiently computes the number of one bits;</li>
|
||||
<li>optimized read from and write to disk;</li>
|
||||
<li>inlinable get() method;</li>
|
||||
<li>store and load, as bit set or d-gaps, depending on sparseness;</li>
|
||||
</ul>
|
||||
* java.util.BitSet, but also includes the following:
|
||||
* <ul>
|
||||
* <li>a count() method, which efficiently computes the number of one bits;</li>
|
||||
* <li>optimized read from and write to disk;</li>
|
||||
* <li>inlinable get() method;</li>
|
||||
* <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class BitVector implements Cloneable, Bits {
|
||||
|
||||
|
@ -41,7 +43,7 @@ public final class BitVector implements Cloneable, Bits {
|
|||
/** Constructs a vector capable of holding <code>n</code> bits. */
|
||||
public BitVector(int n) {
|
||||
size = n;
|
||||
bits = new byte[(size >> 3) + 1];
|
||||
bits = new byte[getNumBytes(size)];
|
||||
count = 0;
|
||||
}
|
||||
|
||||
|
@ -51,6 +53,14 @@ public final class BitVector implements Cloneable, Bits {
|
|||
count = -1;
|
||||
}
|
||||
|
||||
private int getNumBytes(int size) {
|
||||
int bytesLength = size >>> 3;
|
||||
if ((size & 7) != 0) {
|
||||
bytesLength++;
|
||||
}
|
||||
return bytesLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
byte[] copyBits = new byte[bits.length];
|
||||
|
@ -158,6 +168,16 @@ public final class BitVector implements Cloneable, Bits {
|
|||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
private static String CODEC = "BitVector";
|
||||
|
||||
// Version before version tracking was added:
|
||||
private final static int VERSION_PRE = -1;
|
||||
|
||||
// First version:
|
||||
private final static int VERSION_START = 0;
|
||||
|
||||
// Increment version to change it:
|
||||
private final static int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
/** Writes this vector to the file <code>name</code> in Directory
|
||||
<code>d</code>, in a format that can be read by the constructor {@link
|
||||
|
@ -165,6 +185,8 @@ public final class BitVector implements Cloneable, Bits {
|
|||
public final void write(Directory d, String name) throws IOException {
|
||||
IndexOutput output = d.createOutput(name);
|
||||
try {
|
||||
output.writeInt(-2);
|
||||
CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT);
|
||||
if (isSparse()) {
|
||||
writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
|
||||
} else {
|
||||
|
@ -202,19 +224,38 @@ public final class BitVector implements Cloneable, Bits {
|
|||
|
||||
/** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
|
||||
private boolean isSparse() {
|
||||
// note: order of comparisons below set to favor smaller values (no binary range search.)
|
||||
// note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
|
||||
// note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
|
||||
// multiplying count by (8+8) or (8+16) or (8+24) etc.:
|
||||
// - first 8 for writing bits[i] (1 byte vs. 1 bit), and
|
||||
// - second part for writing the byte-number d-gap as vint.
|
||||
|
||||
final int setCount = count();
|
||||
if (setCount == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
final int avgGapLength = bits.length / setCount;
|
||||
|
||||
// expected number of bytes for vInt encoding of each gap
|
||||
final int expectedDGapBytes;
|
||||
if (avgGapLength <= (1<< 7)) {
|
||||
expectedDGapBytes = 1;
|
||||
} else if (avgGapLength <= (1<<14)) {
|
||||
expectedDGapBytes = 2;
|
||||
} else if (avgGapLength <= (1<<21)) {
|
||||
expectedDGapBytes = 3;
|
||||
} else if (avgGapLength <= (1<<28)) {
|
||||
expectedDGapBytes = 4;
|
||||
} else {
|
||||
expectedDGapBytes = 5;
|
||||
}
|
||||
|
||||
// +1 because we write the byte itself that contains the
|
||||
// set bit
|
||||
final int bytesPerSetBit = expectedDGapBytes + 1;
|
||||
|
||||
// note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
|
||||
final long expectedBits = 32 + 8 * bytesPerSetBit * count();
|
||||
|
||||
// note: factor is for read/write of byte-arrays being faster than vints.
|
||||
int factor = 10;
|
||||
if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
|
||||
if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size();
|
||||
if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size();
|
||||
if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size();
|
||||
return factor * (4 + (8+40)*count()) < size();
|
||||
final long factor = 10;
|
||||
return factor * expectedBits < size();
|
||||
}
|
||||
|
||||
/** Constructs a bit vector from the file <code>name</code> in Directory
|
||||
|
@ -222,8 +263,18 @@ public final class BitVector implements Cloneable, Bits {
|
|||
*/
|
||||
public BitVector(Directory d, String name) throws IOException {
|
||||
IndexInput input = d.openInput(name);
|
||||
|
||||
try {
|
||||
size = input.readInt(); // read size
|
||||
final int firstInt = input.readInt();
|
||||
final int version;
|
||||
if (firstInt == -2) {
|
||||
// New format, with full header & version:
|
||||
version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_START);
|
||||
size = input.readInt();
|
||||
} else {
|
||||
version = VERSION_PRE;
|
||||
size = firstInt;
|
||||
}
|
||||
if (size == -1) {
|
||||
readDgaps(input);
|
||||
} else {
|
||||
|
@ -237,7 +288,7 @@ public final class BitVector implements Cloneable, Bits {
|
|||
/** Read as a bit set */
|
||||
private void readBits(IndexInput input) throws IOException {
|
||||
count = input.readInt(); // read count
|
||||
bits = new byte[(size >> 3) + 1]; // allocate bits
|
||||
bits = new byte[getNumBytes(size)]; // allocate bits
|
||||
input.readBytes(bits, 0, bits.length);
|
||||
}
|
||||
|
||||
|
@ -254,30 +305,4 @@ public final class BitVector implements Cloneable, Bits {
|
|||
n -= BYTE_COUNTS[bits[last] & 0xFF];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve a subset of this BitVector.
|
||||
*
|
||||
* @param start
|
||||
* starting index, inclusive
|
||||
* @param end
|
||||
* ending index, exclusive
|
||||
* @return subset
|
||||
*/
|
||||
public BitVector subset(int start, int end) {
|
||||
if (start < 0 || end > size() || end < start)
|
||||
throw new IndexOutOfBoundsException();
|
||||
// Special case -- return empty vector is start == end
|
||||
if (end == start) return new BitVector(0);
|
||||
byte[] bits = new byte[((end - start - 1) >>> 3) + 1];
|
||||
int s = start >>> 3;
|
||||
for (int i = 0; i < bits.length; i++) {
|
||||
int cur = 0xFF & this.bits[i + s];
|
||||
int next = i + s + 1 >= this.bits.length ? 0 : 0xFF & this.bits[i + s + 1];
|
||||
bits[i] = (byte) ((cur >>> (start & 7)) | ((next << (8 - (start & 7)))));
|
||||
}
|
||||
int bitsToClear = (bits.length * 8 - (end - start)) % 8;
|
||||
bits[bits.length - 1] &= ~(0xFF << (8 - bitsToClear));
|
||||
return new BitVector(bits, end - start);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -211,70 +211,4 @@ public class TestBitVector extends LuceneTestCase
|
|||
}
|
||||
return equal;
|
||||
}
|
||||
|
||||
private static int[] subsetPattern = new int[] { 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1 };
|
||||
|
||||
/**
|
||||
* Tests BitVector.subset() against the above pattern
|
||||
*/
|
||||
public void testSubset() {
|
||||
doTestSubset(0, 0);
|
||||
doTestSubset(0, 20);
|
||||
doTestSubset(0, 7);
|
||||
doTestSubset(0, 8);
|
||||
doTestSubset(0, 9);
|
||||
doTestSubset(0, 15);
|
||||
doTestSubset(0, 16);
|
||||
doTestSubset(0, 17);
|
||||
doTestSubset(1, 7);
|
||||
doTestSubset(1, 8);
|
||||
doTestSubset(1, 9);
|
||||
doTestSubset(1, 15);
|
||||
doTestSubset(1, 16);
|
||||
doTestSubset(1, 17);
|
||||
doTestSubset(2, 20);
|
||||
doTestSubset(3, 20);
|
||||
doTestSubset(4, 20);
|
||||
doTestSubset(5, 20);
|
||||
doTestSubset(6, 20);
|
||||
doTestSubset(7, 14);
|
||||
doTestSubset(7, 15);
|
||||
doTestSubset(7, 16);
|
||||
doTestSubset(8, 15);
|
||||
doTestSubset(9, 20);
|
||||
doTestSubset(10, 20);
|
||||
doTestSubset(11, 20);
|
||||
doTestSubset(12, 20);
|
||||
doTestSubset(13, 20);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare a subset against the corresponding portion of the test pattern
|
||||
*/
|
||||
private void doTestSubset(int start, int end) {
|
||||
BitVector full = createSubsetTestVector();
|
||||
BitVector subset = full.subset(start, end);
|
||||
assertEquals(end - start, subset.size());
|
||||
int count = 0;
|
||||
for (int i = start, j = 0; i < end; i++, j++) {
|
||||
if (subsetPattern[i] == 1) {
|
||||
count++;
|
||||
assertTrue(subset.get(j));
|
||||
} else {
|
||||
assertFalse(subset.get(j));
|
||||
}
|
||||
}
|
||||
assertEquals(count, subset.count());
|
||||
}
|
||||
|
||||
private BitVector createSubsetTestVector() {
|
||||
BitVector bv = new BitVector(subsetPattern.length);
|
||||
for (int i = 0; i < subsetPattern.length; i++) {
|
||||
if (subsetPattern[i] == 1) {
|
||||
bv.set(i);
|
||||
}
|
||||
}
|
||||
return bv;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue