LUCENE-5780: Make OrdinalMap more memory-efficient.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1604158 13f79535-47bb-0310-9956-ffa450edef68
2014-06-20 12:22:31 +00:00 · 2014-06-20 12:22:31 +00:00 · e97145a1dd
parent 0e9d6de916
commit e97145a1dd
2 changed files with 17 additions and 6 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -96,12 +96,15 @@ Other

 ======================= Lucene 4.10.0 ======================

-(No Changes)
-
 API Changes

 * LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless)

+Optimizations
+
+* LUCENE-5780: Make OrdinalMap more memory-efficient, especially in case the
+  first segment has all values. (Adrien Grand, Robert Muir)
+
 ======================= Lucene 4.9.0 =======================

 Changes in Runtime Behavior
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -427,14 +427,18 @@ public class MultiDocValues {
      long globalOrd = 0;
      while (mte.next() != null) {        
        TermsEnumWithSlice matches[] = mte.getMatchArray();
+        int firstSegmentIndex = Integer.MAX_VALUE;
+        long globalOrdDelta = Long.MAX_VALUE;
        for (int i = 0; i < mte.getMatchCount(); i++) {
          int segmentIndex = matches[i].index;
          long segmentOrd = matches[i].terms.ord();
          long delta = globalOrd - segmentOrd;
-          // for each unique term, just mark the first segment index/delta where it occurs
-          if (i == 0) {
-            firstSegments.add(segmentIndex);
-            globalOrdDeltas.add(delta);
+          // Track the lowest segment index in which the term occurs. When the
+          // first segment contains most (or, better, all) of the values, this
+          // will help save significant memory.
+          if (segmentIndex < firstSegmentIndex) {
+            firstSegmentIndex = segmentIndex;
+            globalOrdDelta = delta;
          }
          // for each per-segment ord, map it back to the global term.
          while (segmentOrds[segmentIndex] <= segmentOrd) {
@@ -443,6 +447,10 @@ public class MultiDocValues {
            segmentOrds[segmentIndex]++;
          }
        }
+        // for each unique term, just mark the first segment index/delta where it occurs
+        assert firstSegmentIndex < segmentOrds.length;
+        firstSegments.add(firstSegmentIndex);
+        globalOrdDeltas.add(globalOrdDelta);
        globalOrd++;
      }
      firstSegments.freeze();