use globalOrd-localOrd delta when merging instead of huge int[]

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440788 13f79535-47bb-0310-9956-ffa450edef68
2013-01-31 01:59:35 +00:00 · 2013-01-31 01:59:35 +00:00 · 3f0d6bab42
parent 4eec692bdf
commit 3f0d6bab42
3 changed files with 36 additions and 11 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@ -253,11 +253,12 @@ public abstract class DocValuesConsumer implements Closeable {
      BytesRef scratch = new BytesRef();
      AppendingLongBuffer ordDeltas = new AppendingLongBuffer();
-      // nocommit can we factor out the compressed fields
+      // TODO: use another scheme?
-      // compression?  ie we have a good idea "roughly" what
+      // currently we +/- delta merged-ord from segment-ord (is this good? makes sense to me?)
      // but we have a good idea "roughly" what
      // the ord should be (linear projection) so we only
      // need to encode the delta from that ...:        
-      int[] segOrdToMergedOrd;
+      AppendingLongBuffer segOrdToMergedOrd = new AppendingLongBuffer();
      public BytesRef nextTerm() {
        while (ord < values.getValueCount()-1) {
@ -317,11 +318,6 @@ public abstract class DocValuesConsumer implements Closeable {
      TermMergeQueue q = new TermMergeQueue(segStates.size());
      for(SegmentState segState : segStates) {
        if (segState.nextTerm() != null) {
          // nocommit we could defer this to 3rd pass (and
          // reduce transient RAM spike) but then
          // we'd spend more effort computing the mapping...:
          segState.segOrdToMergedOrd = new int[segState.values.getValueCount()];
          q.add(segState);
        }
      }
@ -345,7 +341,12 @@ public abstract class DocValuesConsumer implements Closeable {
          ord++;
        }
-        top.segOrdToMergedOrd[top.ord] = ord-1;
+        long signedDelta = (ord-1) - top.ord; // global ord space - segment ord space
        // fill in any holes for unused ords, then finally the value we want (segOrdToMergedOrd[top.ord])
        // TODO: is there a better way...
        while (top.segOrdToMergedOrd.size() <= top.ord) {
          top.segOrdToMergedOrd.add(signedDelta);
        }
        if (top.nextTerm() == null) {
          q.pop();
        } else {
@ -494,7 +495,7 @@ public abstract class DocValuesConsumer implements Closeable {
                              if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                                nextIsSet = true;
                                int segOrd = currentReader.values.getOrd(docIDUpto);
-                                nextValue = currentReader.segOrdToMergedOrd[segOrd];
+                                nextValue = (int) (segOrd + currentReader.segOrdToMergedOrd.get(segOrd));
                                docIDUpto++;
                                return true;
                              }
--- a/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
@ -29,7 +29,9 @@ import org.apache.lucene.util.RamUsageEstimator;
 */
 public class AppendingLongBuffer {
-  private static final int MAX_PENDING_COUNT = 1024;
+  private static final int BLOCK_BITS = 10;
  private static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
  private static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;
  private long[] minValues;
  private PackedInts.Reader[] values;
@ -55,6 +57,24 @@ public class AppendingLongBuffer {
    pending[pendingOff++] = l;
  }
  /** Get a value from this buffer. 
   *  <p>
   *  <b>NOTE</b>: This class is not really designed for random access!
   *  You will likely get better performance by using packed ints in another way! */
  public long get(int index) {
    assert index < size(); // TODO: do a better check, and throw IndexOutOfBoundsException?
                           // This class is currently only used by the indexer.
    int block = index >> BLOCK_BITS;
    int element = index & BLOCK_MASK;
    if (block == valuesOff) {
      return pending[element];
    } else if (values[block] == null) {
      return minValues[block];
    } else {
      return minValues[block] + values[block].get(element);
    }
  }
  private void packPendingValues() {
    assert pendingOff == MAX_PENDING_COUNT;
--- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
+++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
@ -833,6 +833,10 @@ public class TestPackedInts extends LuceneTestCase {
      }
      assertFalse(it.hasNext());
      for (int i = 0; i < arr.length; ++i) {
        assertEquals(arr[i], buf.get(i));
      }
      final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
      final long computedBytesUsed = buf.ramBytesUsed();
      assertEquals("got " + computedBytesUsed + ", expected: " + expectedBytesUsed,