LUCENE-3254: fix isSparse logic in BitVector; add version header to saved BitVector (*.del) files

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140720 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-06-28 16:46:57 +00:00
parent 71d4dc370b
commit 5b56b5ee98
3 changed files with 79 additions and 116 deletions

View File

@ -489,6 +489,10 @@ Bug fixes
* LUCENE-3251: Directory#copy failed to close target output if opening the
source stream failed. (Simon Willnauer)
* LUCENE-3254: Fixed minor bug in how deletes were written to disk,
causing the file to sometimes be larger than it needed to be. (Mike
McCandless)
Optimizations
* LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated

View File

@ -24,14 +24,16 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
/** Optimized implementation of a vector of bits. This is more-or-less like
java.util.BitSet, but also includes the following:
<ul>
<li>a count() method, which efficiently computes the number of one bits;</li>
<li>optimized read from and write to disk;</li>
<li>inlinable get() method;</li>
<li>store and load, as bit set or d-gaps, depending on sparseness;</li>
</ul>
*/
* java.util.BitSet, but also includes the following:
* <ul>
* <li>a count() method, which efficiently computes the number of one bits;</li>
* <li>optimized read from and write to disk;</li>
* <li>inlinable get() method;</li>
* <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
* </ul>
*
* @lucene.internal
*/
public final class BitVector implements Cloneable, Bits {
private byte[] bits;
@ -41,16 +43,24 @@ public final class BitVector implements Cloneable, Bits {
/**
 * Constructs a vector capable of holding <code>n</code> bits.
 *
 * @param n number of bits the vector holds
 */
public BitVector(int n) {
  size = n;
  // Allocate exactly once; getNumBytes rounds the bit count up to whole bytes.
  bits = new byte[getNumBytes(size)];
  count = 0;
}
/**
 * Wraps an existing byte array as a vector of <code>size</code> bits.
 * The array is stored by reference, not copied.
 *
 * @param bits backing bytes for the vector
 * @param size number of bits the vector holds
 */
BitVector(byte[] bits, int size) {
  // NOTE(review): -1 presumably marks the bit count as not yet computed;
  // confirm against count().
  count = -1;
  this.size = size;
  this.bits = bits;
}
/**
 * Returns the number of bytes needed to hold <code>size</code> bits:
 * the bit count divided by 8, rounded up.
 */
private int getNumBytes(int size) {
  final int wholeBytes = size >>> 3;
  // Any leftover bits (size not a multiple of 8) need one more byte.
  final boolean hasPartialByte = (size & 7) != 0;
  return hasPartialByte ? wholeBytes + 1 : wholeBytes;
}
@Override
public Object clone() {
byte[] copyBits = new byte[bits.length];
@ -158,6 +168,16 @@ public final class BitVector implements Cloneable, Bits {
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
// Codec name passed to CodecUtil when writing and checking the file header:
private static final String CODEC = "BitVector";
// Version before version tracking was added:
private static final int VERSION_PRE = -1;
// First version:
private static final int VERSION_START = 0;
// Increment version to change it:
private static final int VERSION_CURRENT = VERSION_START;
/** Writes this vector to the file <code>name</code> in Directory
<code>d</code>, in a format that can be read by the constructor {@link
@ -165,6 +185,8 @@ public final class BitVector implements Cloneable, Bits {
public final void write(Directory d, String name) throws IOException {
IndexOutput output = d.createOutput(name);
try {
output.writeInt(-2);
CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT);
if (isSparse()) {
writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
} else {
@ -202,19 +224,38 @@ public final class BitVector implements Cloneable, Bits {
/**
 * Indicates if the bit vector is sparse and should be saved as a d-gaps list,
 * or dense, and should be saved as a bit set.
 *
 * @return true if the estimated d-gaps encoding, weighted by a fixed factor,
 *         is smaller than the number of bits in the vector
 */
private boolean isSparse() {
  final int setCount = count();
  if (setCount == 0) {
    return true;
  }

  final int avgGapLength = bits.length / setCount;

  // expected number of bytes for vInt encoding of each gap
  final int expectedDGapBytes;
  if (avgGapLength <= (1 << 7)) {
    expectedDGapBytes = 1;
  } else if (avgGapLength <= (1 << 14)) {
    expectedDGapBytes = 2;
  } else if (avgGapLength <= (1 << 21)) {
    expectedDGapBytes = 3;
  } else if (avgGapLength <= (1 << 28)) {
    expectedDGapBytes = 4;
  } else {
    expectedDGapBytes = 5;
  }

  // +1 because we write the byte itself that contains the
  // set bit
  final int bytesPerSetBit = expectedDGapBytes + 1;

  // note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
  // Computed in long arithmetic (and reusing setCount) so that a large number
  // of set bits cannot overflow int before the comparison below.
  final long expectedBits = 32L + 8L * bytesPerSetBit * setCount;

  // note: factor is for read/write of byte-arrays being faster than vints.
  final long factor = 10;
  return factor * expectedBits < size();
}
/** Constructs a bit vector from the file <code>name</code> in Directory
@ -222,8 +263,18 @@ public final class BitVector implements Cloneable, Bits {
*/
public BitVector(Directory d, String name) throws IOException {
IndexInput input = d.openInput(name);
try {
size = input.readInt(); // read size
final int firstInt = input.readInt();
final int version;
if (firstInt == -2) {
// New format, with full header & version:
version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_START);
size = input.readInt();
} else {
version = VERSION_PRE;
size = firstInt;
}
if (size == -1) {
readDgaps(input);
} else {
@ -237,7 +288,7 @@ public final class BitVector implements Cloneable, Bits {
/**
 * Read as a bit set: reads the stored count, then the raw bytes of the vector.
 * Expects <code>size</code> to have been set already, since it determines how
 * many bytes are read.
 *
 * @param input source positioned at the stored count
 * @throws IOException if reading from <code>input</code> fails
 */
private void readBits(IndexInput input) throws IOException {
  count = input.readInt(); // read count
  // Allocate exactly once; getNumBytes rounds the bit count up to whole bytes.
  bits = new byte[getNumBytes(size)];
  input.readBytes(bits, 0, bits.length);
}
@ -254,30 +305,4 @@ public final class BitVector implements Cloneable, Bits {
n -= BYTE_COUNTS[bits[last] & 0xFF];
}
}
/**
 * Retrieve a subset of this BitVector: a new vector holding the bits in the
 * range [start, end), re-based so that bit <code>start</code> becomes bit 0.
 *
 * @param start
 *          starting index, inclusive
 * @param end
 *          ending index, exclusive
 * @return subset
 * @throws IndexOutOfBoundsException
 *           if start is negative, end exceeds size(), or end is before start
 */
public BitVector subset(int start, int end) {
  if (start < 0 || end > size() || end < start) {
    throw new IndexOutOfBoundsException();
  }

  final int length = end - start;
  // Special case -- return empty vector if start == end
  if (length == 0) {
    return new BitVector(0);
  }

  final int shift = start & 7;        // bit offset of start within its byte
  final int firstByte = start >>> 3;  // index of the byte containing start
  final byte[] result = new byte[((length - 1) >>> 3) + 1];

  for (int i = 0; i < result.length; i++) {
    final int srcIndex = firstByte + i;
    final int low = this.bits[srcIndex] & 0xFF;
    // Past the end of the source array there is nothing to borrow from.
    final int high = srcIndex + 1 < this.bits.length ? this.bits[srcIndex + 1] & 0xFF : 0;
    // Each output byte combines the upper bits of one source byte with the
    // lower bits of the next.
    result[i] = (byte) ((low >>> shift) | (high << (8 - shift)));
  }

  // Zero the unused high bits of the final byte.
  final int bitsToClear = (result.length * 8 - length) % 8;
  result[result.length - 1] &= ~(0xFF << (8 - bitsToClear));

  return new BitVector(result, length);
}
}

View File

@ -211,70 +211,4 @@ public class TestBitVector extends LuceneTestCase
}
return equal;
}
// Expected bit values (1 = set, 0 = clear) shared by the subset tests; the
// reference is final so it cannot be reassigned.
private static final int[] subsetPattern = new int[] { 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1 };
/**
 * Tests BitVector.subset() against the above pattern, over a fixed list of
 * [start, end) ranges.
 */
public void testSubset() {
  // Each entry is { start, end }; ranges are checked in the order listed.
  final int[][] ranges = {
    { 0, 0 }, { 0, 20 }, { 0, 7 }, { 0, 8 }, { 0, 9 }, { 0, 15 }, { 0, 16 }, { 0, 17 },
    { 1, 7 }, { 1, 8 }, { 1, 9 }, { 1, 15 }, { 1, 16 }, { 1, 17 },
    { 2, 20 }, { 3, 20 }, { 4, 20 }, { 5, 20 }, { 6, 20 },
    { 7, 14 }, { 7, 15 }, { 7, 16 },
    { 8, 15 },
    { 9, 20 }, { 10, 20 }, { 11, 20 }, { 12, 20 }, { 13, 20 }
  };
  for (int[] range : ranges) {
    doTestSubset(range[0], range[1]);
  }
}
/**
 * Compare a subset against the corresponding portion of the test pattern:
 * checks the subset's size, every bit, and the total number of set bits.
 */
private void doTestSubset(int start, int end) {
  final BitVector source = createSubsetTestVector();
  final BitVector slice = source.subset(start, end);
  final int length = end - start;
  assertEquals(length, slice.size());

  int expectedOnes = 0;
  for (int offset = 0; offset < length; offset++) {
    if (subsetPattern[start + offset] == 1) {
      expectedOnes++;
      assertTrue(slice.get(offset));
    } else {
      assertFalse(slice.get(offset));
    }
  }
  assertEquals(expectedOnes, slice.count());
}
/**
 * Builds a BitVector as long as subsetPattern, with a bit set wherever the
 * pattern holds 1.
 */
private BitVector createSubsetTestVector() {
  final BitVector vector = new BitVector(subsetPattern.length);
  int index = 0;
  for (int bit : subsetPattern) {
    if (bit == 1) {
      vector.set(index);
    }
    index++;
  }
  return vector;
}
}