LUCENE-3254: fix isSparse logic in BitVector; add version header to saved BitVector (*.del) files

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140720 13f79535-47bb-0310-9956-ffa450edef68
2011-06-28 16:46:57 +00:00 · 2011-06-28 16:46:57 +00:00 · 5b56b5ee98
parent 71d4dc370b
commit 5b56b5ee98
3 changed files with 79 additions and 116 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -489,6 +489,10 @@ Bug fixes
 * LUCENE-3251: Directory#copy failed to close target output if opening the
  source stream failed. (Simon Willnauer)

+* LUCENE-3254: Fixed minor bug in deletes were written to disk,
+  causing the file to sometimes be larger than it needed to be.  (Mike
+  McCandless)
+
 Optimizations

 * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated 
--- a/lucene/src/java/org/apache/lucene/util/BitVector.java
+++ b/lucene/src/java/org/apache/lucene/util/BitVector.java
@ -24,14 +24,16 @@ import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;

 /** Optimized implementation of a vector of bits.  This is more-or-less like
-  java.util.BitSet, but also includes the following:
-  <ul>
-  <li>a count() method, which efficiently computes the number of one bits;</li>
-  <li>optimized read from and write to disk;</li>
-  <li>inlinable get() method;</li>
-  <li>store and load, as bit set or d-gaps, depending on sparseness;</li> 
-  </ul>
-  */
+ *  java.util.BitSet, but also includes the following:
+ *  <ul>
+ *  <li>a count() method, which efficiently computes the number of one bits;</li>
+ *  <li>optimized read from and write to disk;</li>
+ *  <li>inlinable get() method;</li>
+ *  <li>store and load, as bit set or d-gaps, depending on sparseness;</li> 
+ *  </ul>
+ *
+ *  @lucene.internal
+ */
 public final class BitVector implements Cloneable, Bits {

  private byte[] bits;
@ -41,16 +43,24 @@ public final class BitVector implements Cloneable, Bits {
  /** Constructs a vector capable of holding <code>n</code> bits. */
  public BitVector(int n) {
    size = n;
-    bits = new byte[(size >> 3) + 1];
+    bits = new byte[getNumBytes(size)];
    count = 0;
  }
-  
+
  BitVector(byte[] bits, int size) {
    this.bits = bits;
    this.size = size;
    count = -1;
  }
  
+  private int getNumBytes(int size) {
+    int bytesLength = size >>> 3;
+    if ((size & 7) != 0) {
+      bytesLength++;
+    }
+    return bytesLength;
+  }
+  
  @Override
  public Object clone() {
    byte[] copyBits = new byte[bits.length];
@ -158,6 +168,16 @@ public final class BitVector implements Cloneable, Bits {
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
  };

+  private static String CODEC = "BitVector";
+
+  // Version before version tracking was added:
+  private final static int VERSION_PRE = -1;
+
+  // First version:
+  private final static int VERSION_START = 0;
+
+  // Increment version to change it:
+  private final static int VERSION_CURRENT = VERSION_START;

  /** Writes this vector to the file <code>name</code> in Directory
    <code>d</code>, in a format that can be read by the constructor {@link
@ -165,6 +185,8 @@ public final class BitVector implements Cloneable, Bits {
  public final void write(Directory d, String name) throws IOException {
    IndexOutput output = d.createOutput(name);
    try {
+      output.writeInt(-2);
+      CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT);
      if (isSparse()) { 
        writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
      } else {
@ -202,19 +224,38 @@ public final class BitVector implements Cloneable, Bits {

  /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
  private boolean isSparse() {
-    // note: order of comparisons below set to favor smaller values (no binary range search.)
-    // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
-    // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
-    //       multiplying count by (8+8) or (8+16) or (8+24) etc.:
-    //       - first 8 for writing bits[i] (1 byte vs. 1 bit), and 
-    //       - second part for writing the byte-number d-gap as vint. 
+
+    final int setCount = count();
+    if (setCount == 0) {
+      return true;
+    }
+
+    final int avgGapLength = bits.length / setCount;
+
+    // expected number of bytes for vInt encoding of each gap
+    final int expectedDGapBytes;
+    if (avgGapLength <= (1<< 7)) {
+      expectedDGapBytes = 1;
+    } else if (avgGapLength <= (1<<14)) {
+      expectedDGapBytes = 2;
+    } else if (avgGapLength <= (1<<21)) {
+      expectedDGapBytes = 3;
+    } else if (avgGapLength <= (1<<28)) {
+      expectedDGapBytes = 4;
+    } else {
+      expectedDGapBytes = 5;
+    }
+
+    // +1 because we write the byte itself that contains the
+    // set bit
+    final int bytesPerSetBit = expectedDGapBytes + 1;
+    
+    // note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
+    final long expectedBits = 32 + 8 * bytesPerSetBit * count();
+
    // note: factor is for read/write of byte-arrays being faster than vints.  
-    int factor = 10;  
-    if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
-    if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size();
-    if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size();
-    if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size();
-    return                            factor * (4 + (8+40)*count()) < size();
+    final long factor = 10;  
+    return factor * expectedBits < size();
  }

  /** Constructs a bit vector from the file <code>name</code> in Directory
@ -222,8 +263,18 @@ public final class BitVector implements Cloneable, Bits {
    */
  public BitVector(Directory d, String name) throws IOException {
    IndexInput input = d.openInput(name);
+
    try {
-      size = input.readInt();       // read size
+      final int firstInt = input.readInt();
+      final int version;
+      if (firstInt == -2) {
+        // New format, with full header & version:
+        version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_START);
+        size = input.readInt();
+      } else {
+        version = VERSION_PRE;
+        size = firstInt;
+      }
      if (size == -1) {
        readDgaps(input);
      } else {
@ -237,7 +288,7 @@ public final class BitVector implements Cloneable, Bits {
  /** Read as a bit set */
  private void readBits(IndexInput input) throws IOException {
    count = input.readInt();        // read count
-    bits = new byte[(size >> 3) + 1];     // allocate bits
+    bits = new byte[getNumBytes(size)];     // allocate bits
    input.readBytes(bits, 0, bits.length);
  }

@ -254,30 +305,4 @@ public final class BitVector implements Cloneable, Bits {
      n -= BYTE_COUNTS[bits[last] & 0xFF];
    }          
  }
-
-  /**
-   * Retrieve a subset of this BitVector.
-   * 
-   * @param start
-   *            starting index, inclusive
-   * @param end
-   *            ending index, exclusive
-   * @return subset
-   */
-  public BitVector subset(int start, int end) {
-    if (start < 0 || end > size() || end < start)
-      throw new IndexOutOfBoundsException();
-    // Special case -- return empty vector is start == end
-    if (end == start) return new BitVector(0);
-    byte[] bits = new byte[((end - start - 1) >>> 3) + 1];
-    int s = start >>> 3;
-    for (int i = 0; i < bits.length; i++) {
-      int cur = 0xFF & this.bits[i + s];
-      int next = i + s + 1 >= this.bits.length ? 0 : 0xFF & this.bits[i + s + 1];
-      bits[i] = (byte) ((cur >>> (start & 7)) | ((next << (8 - (start & 7)))));
-    }
-    int bitsToClear = (bits.length * 8 - (end - start)) % 8;
-    bits[bits.length - 1] &= ~(0xFF << (8 - bitsToClear));
-    return new BitVector(bits, end - start);
-  }
 }
--- a/lucene/src/test/org/apache/lucene/util/TestBitVector.java
+++ b/lucene/src/test/org/apache/lucene/util/TestBitVector.java
@ -211,70 +211,4 @@ public class TestBitVector extends LuceneTestCase
        }
        return equal;
    }
-    
-    private static int[] subsetPattern = new int[] { 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1 };
-    
-    /**
-     * Tests BitVector.subset() against the above pattern
-     */
-    public void testSubset() {
-    	doTestSubset(0, 0);
-    	doTestSubset(0, 20);
-    	doTestSubset(0, 7);
-    	doTestSubset(0, 8);
-    	doTestSubset(0, 9);
-    	doTestSubset(0, 15);
-    	doTestSubset(0, 16);
-    	doTestSubset(0, 17);
-    	doTestSubset(1, 7);
-    	doTestSubset(1, 8);
-    	doTestSubset(1, 9);
-    	doTestSubset(1, 15);
-    	doTestSubset(1, 16);
-    	doTestSubset(1, 17);
-    	doTestSubset(2, 20);
-    	doTestSubset(3, 20);
-    	doTestSubset(4, 20);
-    	doTestSubset(5, 20);
-    	doTestSubset(6, 20);
-    	doTestSubset(7, 14);
-    	doTestSubset(7, 15);
-    	doTestSubset(7, 16);
-    	doTestSubset(8, 15);
-    	doTestSubset(9, 20);
-    	doTestSubset(10, 20);
-    	doTestSubset(11, 20);
-    	doTestSubset(12, 20);
-    	doTestSubset(13, 20);
-    }
-    
-    /**
-     * Compare a subset against the corresponding portion of the test pattern
-     */
-    private void doTestSubset(int start, int end) {
-    	BitVector full = createSubsetTestVector();
-    	BitVector subset = full.subset(start, end);
-    	assertEquals(end - start, subset.size());
-    	int count = 0;
-    	for (int i = start, j = 0; i < end; i++, j++) {
-    		if (subsetPattern[i] == 1) {
-    			count++;
-    			assertTrue(subset.get(j));
-    		} else {
-    			assertFalse(subset.get(j));
-    		}
-    	}
-    	assertEquals(count, subset.count());
-    }
-    
-    private BitVector createSubsetTestVector() {
-    	BitVector bv = new BitVector(subsetPattern.length);
-    	for (int i = 0; i < subsetPattern.length; i++) {
-    		if (subsetPattern[i] == 1) {
-    			bv.set(i);
-    		}
-    	}
-    	return bv;
-    }
-    
 }