LUCENE-3295: fix several issues in BitVector.writeClearedDgaps

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-07-10 19:25:11 +00:00
parent 31602b986b
commit 807dad7038
3 changed files with 91 additions and 12 deletions

View File

@ -213,7 +213,6 @@ public class MemoryCodec extends Codec {
System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF)); System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF));
} }
} }
builder.add(text, new BytesRef(spare)); builder.add(text, new BytesRef(spare));
termCount++; termCount++;
} }

View File

@ -162,6 +162,7 @@ public final class BitVector implements Cloneable, Bits {
} }
count = c; count = c;
} }
assert count <= size: "count=" + count + " size=" + size;
return count; return count;
} }
@ -227,6 +228,7 @@ public final class BitVector implements Cloneable, Bits {
} else { } else {
writeBits(output); writeBits(output);
} }
assert verifyCount();
} finally { } finally {
output.close(); output.close();
} }
@ -278,14 +280,13 @@ public final class BitVector implements Cloneable, Bits {
output.writeInt(count()); // write count output.writeInt(count()); // write count
int last=0; int last=0;
int numCleared = size()-count(); int numCleared = size()-count();
int m = bits.length; for (int i=0; i<bits.length && numCleared>0; i++) {
for (int i=0; i<m && numCleared>0; i++) { if (bits[i] != (byte) 0xff) {
if (bits[i]!=0xff) {
output.writeVInt(i-last); output.writeVInt(i-last);
output.writeByte(bits[i]); output.writeByte(bits[i]);
last = i; last = i;
numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]); numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]);
assert numCleared >= 0; assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7)));
} }
} }
} }
@ -319,7 +320,7 @@ public final class BitVector implements Cloneable, Bits {
final int bytesPerSetBit = expectedDGapBytes + 1; final int bytesPerSetBit = expectedDGapBytes + 1;
// note: adding 32 because we start with ((int) -1) to indicate d-gaps format. // note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
final long expectedBits = 32 + 8 * bytesPerSetBit * count(); final long expectedBits = 32 + 8 * bytesPerSetBit * clearedCount;
// note: factor is for read/write of byte-arrays being faster than vints. // note: factor is for read/write of byte-arrays being faster than vints.
final long factor = 10; final long factor = 10;
@ -352,11 +353,21 @@ public final class BitVector implements Cloneable, Bits {
} else { } else {
readBits(input); readBits(input);
} }
assert verifyCount();
} finally { } finally {
input.close(); input.close();
} }
} }
// asserts only
private boolean verifyCount() {
assert count != -1;
final int countSav = count;
count = -1;
assert countSav == count(): "saved count was " + countSav + " but recomputed count is " + count;
return true;
}
/** Read as a bit set */ /** Read as a bit set */
private void readBits(IndexInput input) throws IOException { private void readBits(IndexInput input) throws IOException {
count = input.readInt(); // read count count = input.readInt(); // read count
@ -368,7 +379,7 @@ public final class BitVector implements Cloneable, Bits {
private void readSetDgaps(IndexInput input) throws IOException { private void readSetDgaps(IndexInput input) throws IOException {
size = input.readInt(); // (re)read size size = input.readInt(); // (re)read size
count = input.readInt(); // read count count = input.readInt(); // read count
bits = new byte[(size >> 3) + 1]; // allocate bits bits = new byte[getNumBytes(size)]; // allocate bits
int last=0; int last=0;
int n = count(); int n = count();
while (n>0) { while (n>0) {
@ -383,7 +394,7 @@ public final class BitVector implements Cloneable, Bits {
private void readClearedDgaps(IndexInput input) throws IOException { private void readClearedDgaps(IndexInput input) throws IOException {
size = input.readInt(); // (re)read size size = input.readInt(); // (re)read size
count = input.readInt(); // read count count = input.readInt(); // read count
bits = new byte[(size >> 3) + 1]; // allocate bits bits = new byte[getNumBytes(size)]; // allocate bits
Arrays.fill(bits, (byte) 0xff); Arrays.fill(bits, (byte) 0xff);
clearUnusedBits(); clearUnusedBits();
int last=0; int last=0;
@ -392,7 +403,7 @@ public final class BitVector implements Cloneable, Bits {
last += input.readVInt(); last += input.readVInt();
bits[last] = input.readByte(); bits[last] = input.readByte();
numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF]; numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF];
assert numCleared >= 0; assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7)));
} }
} }
} }

View File

@ -19,7 +19,7 @@ package org.apache.lucene.util;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
@ -165,6 +165,26 @@ public class TestBitVector extends LuceneTestCase
doTestDgaps(10000,40,43); doTestDgaps(10000,40,43);
doTestDgaps(100000,415,418); doTestDgaps(100000,415,418);
doTestDgaps(1000000,3123,3126); doTestDgaps(1000000,3123,3126);
// now exercise skipping of fully populated byte in the bitset (they are omitted if bitset is sparse)
MockDirectoryWrapper d = new MockDirectoryWrapper(random, new RAMDirectory());
d.setPreventDoubleWrite(false);
BitVector bv = new BitVector(10000);
bv.set(0);
for (int i = 8; i < 16; i++) {
bv.set(i);
} // make sure we have once byte full of set bits
for (int i = 32; i < 40; i++) {
bv.set(i);
} // get a second byte full of set bits
// add some more bits here
for (int i = 40; i < 10000; i++) {
if (random.nextInt(1000) == 0) {
bv.set(i);
}
}
bv.write(d, "TESTBV", newIOContext(random));
BitVector compare = new BitVector(d, "TESTBV", newIOContext(random));
assertTrue(doCompare(bv,compare));
} }
private void doTestDgaps(int size, int count1, int count2) throws IOException { private void doTestDgaps(int size, int count1, int count2) throws IOException {
@ -183,7 +203,7 @@ public class TestBitVector extends LuceneTestCase
assertTrue(doCompare(bv,bv2)); assertTrue(doCompare(bv,bv2));
bv = bv2; bv = bv2;
bv.clear(i); bv.clear(i);
assertEquals(i+1,size-bv.count()); assertEquals(i+1, size-bv.count());
bv.write(d, "TESTBV", newIOContext(random)); bv.write(d, "TESTBV", newIOContext(random));
} }
// now start decreasing number of set bits // now start decreasing number of set bits
@ -196,6 +216,54 @@ public class TestBitVector extends LuceneTestCase
bv.write(d, "TESTBV", newIOContext(random)); bv.write(d, "TESTBV", newIOContext(random));
} }
} }
public void testSparseWrite() throws IOException {
Directory d = newDirectory();
final int numBits = 10240;
BitVector bv = new BitVector(numBits);
bv.invertAll();
int numToClear = random.nextInt(5);
for(int i=0;i<numToClear;i++) {
bv.clear(random.nextInt(numBits));
}
bv.write(d, "test", newIOContext(random));
final long size = d.fileLength("test");
assertTrue("size=" + size, size < 100);
d.close();
}
public void testClearedBitNearEnd() throws IOException {
Directory d = newDirectory();
final int numBits = _TestUtil.nextInt(random, 7, 1000);
BitVector bv = new BitVector(numBits);
bv.invertAll();
bv.clear(numBits-_TestUtil.nextInt(random, 1, 7));
bv.write(d, "test", newIOContext(random));
assertEquals(numBits-1, bv.count());
d.close();
}
public void testMostlySet() throws IOException {
Directory d = newDirectory();
final int numBits = _TestUtil.nextInt(random, 30, 1000);
for(int numClear=0;numClear<20;numClear++) {
BitVector bv = new BitVector(numBits);
bv.invertAll();
int count = 0;
while(count < numClear) {
final int bit = random.nextInt(numBits);
// Don't use getAndClear, so that count is recomputed
if (bv.get(bit)) {
bv.clear(bit);
count++;
assertEquals(numBits-count, bv.count());
}
}
}
d.close();
}
/** /**
* Compare two BitVectors. * Compare two BitVectors.
* This should really be an equals method on the BitVector itself. * This should really be an equals method on the BitVector itself.
@ -211,6 +279,7 @@ public class TestBitVector extends LuceneTestCase
break; break;
} }
} }
assertEquals(bv.count(), compare.count());
return equal; return equal;
} }
} }