use globalOrd-localOrd delta when merging instead of huge int[]

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440788 13f79535-47bb-0310-9956-ffa450edef68
2025-03-06 08:19:23 +00:00 · 2013-01-31 01:59:35 +00:00 · 2013-01-31 01:59:35 +00:00 · 3f0d6bab42
commit 3f0d6bab42
parent 4eec692bdf
3 changed files with 36 additions and 11 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@ -253,11 +253,12 @@ public abstract class DocValuesConsumer implements Closeable {
      BytesRef scratch = new BytesRef();
      AppendingLongBuffer ordDeltas = new AppendingLongBuffer();

-      // nocommit can we factor out the compressed fields
-      // compression?  ie we have a good idea "roughly" what
+      // TODO: use another scheme?
+      // currently we +/- delta merged-ord from segment-ord (is this good? makes sense to me?)
+      // but we have a good idea "roughly" what
      // the ord should be (linear projection) so we only
      // need to encode the delta from that ...:        
-      int[] segOrdToMergedOrd;
+      AppendingLongBuffer segOrdToMergedOrd = new AppendingLongBuffer();

      public BytesRef nextTerm() {
        while (ord < values.getValueCount()-1) {
@ -317,11 +318,6 @@ public abstract class DocValuesConsumer implements Closeable {
      TermMergeQueue q = new TermMergeQueue(segStates.size());
      for(SegmentState segState : segStates) {
        if (segState.nextTerm() != null) {
-
-          // nocommit we could defer this to 3rd pass (and
-          // reduce transient RAM spike) but then
-          // we'd spend more effort computing the mapping...:
-          segState.segOrdToMergedOrd = new int[segState.values.getValueCount()];
          q.add(segState);
        }
      }
@ -345,7 +341,12 @@ public abstract class DocValuesConsumer implements Closeable {
          ord++;
        }

-        top.segOrdToMergedOrd[top.ord] = ord-1;
+        long signedDelta = (ord-1) - top.ord; // global ord space - segment ord space
+        // fill in any holes for unused ords, then finally the value we want (segOrdToMergedOrd[top.ord])
+        // TODO: is there a better way...
+        while (top.segOrdToMergedOrd.size() <= top.ord) {
+          top.segOrdToMergedOrd.add(signedDelta);
+        }
        if (top.nextTerm() == null) {
          q.pop();
        } else {
@ -494,7 +495,7 @@ public abstract class DocValuesConsumer implements Closeable {
                              if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                                nextIsSet = true;
                                int segOrd = currentReader.values.getOrd(docIDUpto);
-                                nextValue = currentReader.segOrdToMergedOrd[segOrd];
+                                nextValue = (int) (segOrd + currentReader.segOrdToMergedOrd.get(segOrd));
                                docIDUpto++;
                                return true;
                              }
--- a/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/AppendingLongBuffer.java
@ -29,7 +29,9 @@ import org.apache.lucene.util.RamUsageEstimator;
 */
 public class AppendingLongBuffer {

-  private static final int MAX_PENDING_COUNT = 1024;
+  private static final int BLOCK_BITS = 10;
+  private static final int MAX_PENDING_COUNT = 1 << BLOCK_BITS;
+  private static final int BLOCK_MASK = MAX_PENDING_COUNT - 1;

  private long[] minValues;
  private PackedInts.Reader[] values;
@ -54,6 +56,24 @@ public class AppendingLongBuffer {
    }
    pending[pendingOff++] = l;
  }
+  
+  /** Get a value from this buffer. 
+   *  <p>
+   *  <b>NOTE</b>: This class is not really designed for random access!
+   *  You will likely get better performance by using packed ints in another way! */
+  public long get(int index) {
+    assert index < size(); // TODO: do a better check, and throw IndexOutOfBoundsException?
+                           // This class is currently only used by the indexer.
+    int block = index >> BLOCK_BITS;
+    int element = index & BLOCK_MASK;
+    if (block == valuesOff) {
+      return pending[element];
+    } else if (values[block] == null) {
+      return minValues[block];
+    } else {
+      return minValues[block] + values[block].get(element);
+    }
+  }

  private void packPendingValues() {
    assert pendingOff == MAX_PENDING_COUNT;
--- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
+++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
@ -832,6 +832,10 @@ public class TestPackedInts extends LuceneTestCase {
        assertEquals(arr[i], it.next());
      }
      assertFalse(it.hasNext());
+      
+      for (int i = 0; i < arr.length; ++i) {
+        assertEquals(arr[i], buf.get(i));
+      }

      final long expectedBytesUsed = RamUsageEstimator.sizeOf(buf);
      final long computedBytesUsed = buf.ramBytesUsed();