From e97145a1ddca855f7bc4f1d0fdd3e23348f38448 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 20 Jun 2014 12:22:31 +0000 Subject: [PATCH] LUCENE-5780: Make OrdinalMap more memory-efficient. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1604158 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 7 +++++-- .../org/apache/lucene/index/MultiDocValues.java | 16 ++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1f54db01e39..6a48d22bc07 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -96,12 +96,15 @@ Other ======================= Lucene 4.10.0 ====================== -(No Changes) - API Changes * LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless) +Optimizations + +* LUCENE-5780: Make OrdinalMap more memory-efficient, especially in case the + first segment has all values. (Adrien Grand, Robert Muir) + ======================= Lucene 4.9.0 ======================= Changes in Runtime Behavior diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java index 23757e0197b..9ad9026f0a9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java @@ -427,14 +427,18 @@ public class MultiDocValues { long globalOrd = 0; while (mte.next() != null) { TermsEnumWithSlice matches[] = mte.getMatchArray(); + int firstSegmentIndex = Integer.MAX_VALUE; + long globalOrdDelta = Long.MAX_VALUE; for (int i = 0; i < mte.getMatchCount(); i++) { int segmentIndex = matches[i].index; long segmentOrd = matches[i].terms.ord(); long delta = globalOrd - segmentOrd; - // for each unique term, just mark the first segment index/delta where it occurs - if (i == 0) { - firstSegments.add(segmentIndex); - globalOrdDeltas.add(delta); + // We compute the least segment where the term occurs. In case the + // first segment contains most (or better all) values, this will + // help save significant memory + if (segmentIndex < firstSegmentIndex) { + firstSegmentIndex = segmentIndex; + globalOrdDelta = delta; } // for each per-segment ord, map it back to the global term. while (segmentOrds[segmentIndex] <= segmentOrd) { @@ -443,6 +447,10 @@ public class MultiDocValues { segmentOrds[segmentIndex]++; } } + // for each unique term, just mark the first segment index/delta where it occurs + assert firstSegmentIndex < segmentOrds.length; + firstSegments.add(firstSegmentIndex); + globalOrdDeltas.add(globalOrdDelta); globalOrd++; } firstSegments.freeze();