diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4006634cf55..899f406d67f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -110,6 +110,9 @@ Optimizations
* LUCENE-5780: Make OrdinalMap more memory-efficient, especially in case the
first segment has all values. (Adrien Grand, Robert Muir)
+* LUCENE-5782: OrdinalMap now sorts its input enums by descending weight
+  before building the map in order to improve compression. (Adrien Grand)
+
======================= Lucene 4.9.0 =======================
Changes in Runtime Behavior
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
index 6e1cd9d71f2..3b187bc910b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@@ -40,6 +40,7 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.LongValues;
+import org.apache.lucene.util.packed.PackedInts;
/**
* Abstract API that consumes numeric, binary and
@@ -440,12 +441,14 @@ public abstract class DocValuesConsumer implements Closeable {
// step 1: iterate thru each sub and mark terms still in use
TermsEnum liveTerms[] = new TermsEnum[dvs.length];
+ long[] weights = new long[liveTerms.length];
for (int sub = 0; sub < liveTerms.length; sub++) {
AtomicReader reader = readers[sub];
SortedDocValues dv = dvs[sub];
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
liveTerms[sub] = dv.termsEnum();
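+          // no deletions: every value is live, so the weight is the sub's full value count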
+ weights[sub] = dv.getValueCount();
} else {
LongBitSet bitset = new LongBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
@@ -457,11 +460,12 @@ public abstract class DocValuesConsumer implements Closeable {
}
}
liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
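+          // with deletions, only values still referenced by live docs count towards the weight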
+ weights[sub] = bitset.cardinality();
}
}
// step 2: create ordinal map (this conceptually does the "merging")
- final OrdinalMap map = new OrdinalMap(this, liveTerms);
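+      // the weights let OrdinalMap visit the heaviest subs first, which tends to keep
+      // ordinal deltas small; COMPACT trades lookup speed for memory during the merge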
+ final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT);
// step 3: add field
addSortedField(fieldInfo,
@@ -576,12 +580,14 @@ public abstract class DocValuesConsumer implements Closeable {
// step 1: iterate thru each sub and mark terms still in use
TermsEnum liveTerms[] = new TermsEnum[dvs.length];
+ long[] weights = new long[liveTerms.length];
for (int sub = 0; sub < liveTerms.length; sub++) {
AtomicReader reader = readers[sub];
SortedSetDocValues dv = dvs[sub];
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
liveTerms[sub] = dv.termsEnum();
+ weights[sub] = dv.getValueCount();
} else {
LongBitSet bitset = new LongBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
@@ -594,11 +600,12 @@ public abstract class DocValuesConsumer implements Closeable {
}
}
liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
+ weights[sub] = bitset.cardinality();
}
}
// step 2: create ordinal map (this conceptually does the "merging")
- final OrdinalMap map = new OrdinalMap(this, liveTerms);
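+      // as above, each sub is weighted by its live value count so the heaviest sort first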
+ final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT);
// step 3: add field
addSortedSetField(fieldInfo,
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
index 9ad9026f0a9..ad169ab506f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -18,6 +18,7 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
+import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex;
@@ -25,6 +26,7 @@ import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.AppendingPackedLongBuffer;
@@ -322,11 +324,7 @@ public class MultiDocValues {
if (!anyReal) {
return null;
} else {
- TermsEnum enums[] = new TermsEnum[values.length];
- for (int i = 0; i < values.length; i++) {
- enums[i] = values[i].termsEnum();
- }
- OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums);
+ OrdinalMap mapping = OrdinalMap.build(r.getCoreCacheKey(), values, PackedInts.DEFAULT);
return new MultiSortedDocValues(values, starts, mapping);
}
}
@@ -366,20 +364,125 @@ public class MultiDocValues {
if (!anyReal) {
return null;
} else {
- TermsEnum enums[] = new TermsEnum[values.length];
- for (int i = 0; i < values.length; i++) {
- enums[i] = values[i].termsEnum();
- }
- OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums);
+ OrdinalMap mapping = OrdinalMap.build(r.getCoreCacheKey(), values, PackedInts.DEFAULT);
return new MultiSortedSetDocValues(values, starts, mapping);
}
}
/** maps per-segment ordinals to/from global ordinal space */
+ // TODO: we could also have a utility method to merge Terms[] and use size() as a weight when we need it
// TODO: use more efficient packed ints structures?
// TODO: pull this out? its pretty generic (maps between N ord()-enabled TermsEnums)
public static class OrdinalMap implements Accountable {
+ private static class SegmentMap implements Accountable {
+ private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SegmentMap.class);
+
+      /** Build a map from an index into a sorted view of <code>weights</code> to an index into <code>weights</code>. */
+ private static int[] map(final long[] weights) {
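+        // e.g. weights = {10, 50, 30} sorts (descending) to {50, 30, 10},
+        // so newToOld = {1, 2, 0}: entry 0 points at the heaviest sub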
+ final int[] newToOld = new int[weights.length];
+ for (int i = 0; i < weights.length; ++i) {
+ newToOld[i] = i;
+ }
+ new InPlaceMergeSorter() {
+ @Override
+ protected void swap(int i, int j) {
+ final int tmp = newToOld[i];
+ newToOld[i] = newToOld[j];
+ newToOld[j] = tmp;
+ }
+ @Override
+ protected int compare(int i, int j) {
+ // j first since we actually want higher weights first
+ return Long.compare(weights[newToOld[j]], weights[newToOld[i]]);
+ }
+ }.sort(0, weights.length);
+ return newToOld;
+ }
+
+      /** Invert the map. */
+ private static int[] inverse(int[] map) {
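+        // e.g. inverting {1, 2, 0} yields {2, 0, 1} (map[i] == j implies inverse[j] == i)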
+ final int[] inverse = new int[map.length];
+ for (int i = 0; i < map.length; ++i) {
+ inverse[map[i]] = i;
+ }
+ return inverse;
+ }
+
+ private final int[] newToOld, oldToNew;
+
+ SegmentMap(long[] weights) {
+ newToOld = map(weights);
+ oldToNew = inverse(newToOld);
+ assert Arrays.equals(newToOld, inverse(oldToNew));
+ }
+
+ int newToOld(int segment) {
+ return newToOld[segment];
+ }
+
+ int oldToNew(int segment) {
+ return oldToNew[segment];
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(newToOld) + RamUsageEstimator.sizeOf(oldToNew);
+ }
+
+ }
+
+ /**
+ * Create an ordinal map that uses the number of unique values of each
+ * {@link SortedDocValues} instance as a weight.
+ * @see #build(Object, TermsEnum[], long[], float)
+ */
+ public static OrdinalMap build(Object owner, SortedDocValues[] values, float acceptableOverheadRatio) throws IOException {
+ final TermsEnum[] subs = new TermsEnum[values.length];
+ final long[] weights = new long[values.length];
+ for (int i = 0; i < values.length; ++i) {
+ subs[i] = values[i].termsEnum();
+ weights[i] = values[i].getValueCount();
+ }
+ return build(owner, subs, weights, acceptableOverheadRatio);
+ }
+
+ /**
+ * Create an ordinal map that uses the number of unique values of each
+ * {@link SortedSetDocValues} instance as a weight.
+ * @see #build(Object, TermsEnum[], long[], float)
+ */
+ public static OrdinalMap build(Object owner, SortedSetDocValues[] values, float acceptableOverheadRatio) throws IOException {
+ final TermsEnum[] subs = new TermsEnum[values.length];
+ final long[] weights = new long[values.length];
+ for (int i = 0; i < values.length; ++i) {
+ subs[i] = values[i].termsEnum();
+ weights[i] = values[i].getValueCount();
+ }
+ return build(owner, subs, weights, acceptableOverheadRatio);
+ }
+
+ /**
+ * Creates an ordinal map that allows mapping ords to/from a merged
+   * space from <code>subs</code>.
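+   * <p>For example (an illustrative sketch; <code>dv1</code>, <code>dv2</code>
+   * and <code>cacheKey</code> are placeholders for caller-supplied values):
+   * <pre>
+   * TermsEnum[] subs = new TermsEnum[] { dv1.termsEnum(), dv2.termsEnum() };
+   * long[] weights = new long[] { dv1.getValueCount(), dv2.getValueCount() };
+   * OrdinalMap map = OrdinalMap.build(cacheKey, subs, weights, PackedInts.DEFAULT);
+   * </pre>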
+ * @param owner a cache key
+ * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need
+   *             not be dense (e.g. can be FilteredTermsEnums).
+ * @param weights a weight for each sub. This is ideally correlated with
+ * the number of unique terms that each sub introduces compared
+ * to the other subs
+ * @throws IOException if an I/O error occurred.
+ */
+ public static OrdinalMap build(Object owner, TermsEnum subs[], long[] weights, float acceptableOverheadRatio) throws IOException {
+ if (subs.length != weights.length) {
+ throw new IllegalArgumentException("subs and weights must have the same length");
+ }
+
+    // the incoming enums are in arbitrary order: sort them by descending weight to save memory
+ final SegmentMap segmentMap = new SegmentMap(weights);
+ return new OrdinalMap(owner, subs, segmentMap, acceptableOverheadRatio);
+ }
+
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OrdinalMap.class);
// cache key of whoever asked for this awful thing
@@ -390,21 +493,16 @@ public class MultiDocValues {
final AppendingPackedLongBuffer firstSegments;
// for every segment, segmentOrd -> globalOrd
final LongValues segmentToGlobalOrds[];
+ // the map from/to segment ids
+ final SegmentMap segmentMap;
// ram usage
final long ramBytesUsed;
- /**
- * Creates an ordinal map that allows mapping ords to/from a merged
-   * space from <code>subs</code>.
- * @param owner a cache key
- * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need
- * not be dense (e.g. can be FilteredTermsEnums}.
- * @throws IOException if an I/O error occurred.
- */
- public OrdinalMap(Object owner, TermsEnum subs[], float acceptableOverheadRatio) throws IOException {
+ OrdinalMap(Object owner, TermsEnum subs[], SegmentMap segmentMap, float acceptableOverheadRatio) throws IOException {
// create the ordinal mappings by pulling a termsenum over each sub's
// unique terms, and walking a multitermsenum over those
this.owner = owner;
+ this.segmentMap = segmentMap;
// even though we accept an overhead ratio, we keep these ones with COMPACT
// since they are only used to resolve values given a global ord, which is
// slow anyway
@@ -420,7 +518,7 @@ public class MultiDocValues {
TermsEnumIndex indexes[] = new TermsEnumIndex[slices.length];
for (int i = 0; i < slices.length; i++) {
slices[i] = new ReaderSlice(0, 0, i);
- indexes[i] = new TermsEnumIndex(subs[i], i);
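+        // slot i holds the i-th heaviest sub: visiting subs in descending-weight order
+        // tends to keep first-segment ids and ordinal deltas small and compressible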
+ indexes[i] = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i);
}
MultiTermsEnum mte = new MultiTermsEnum(slices);
mte.reset(indexes);
@@ -460,7 +558,9 @@ public class MultiDocValues {
}
// ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
segmentToGlobalOrds = new LongValues[subs.length];
- long ramBytesUsed = BASE_RAM_BYTES_USED + globalOrdDeltas.ramBytesUsed() + firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds);
+ long ramBytesUsed = BASE_RAM_BYTES_USED + globalOrdDeltas.ramBytesUsed()
+ + firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
+ + segmentMap.ramBytesUsed();
for (int i = 0; i < ordDeltas.length; ++i) {
final MonotonicAppendingLongBuffer deltas = ordDeltas[i];
if (ordDeltaBits[i] == 0L) {
@@ -503,17 +603,12 @@ public class MultiDocValues {
this.ramBytesUsed = ramBytesUsed;
}
- /** Create an {@link OrdinalMap} with the default overhead ratio. */
- public OrdinalMap(Object owner, TermsEnum subs[]) throws IOException {
- this(owner, subs, PackedInts.DEFAULT);
- }
-
/**
* Given a segment number, return a {@link LongValues} instance that maps
* segment ordinals to global ordinals.
*/
public LongValues getGlobalOrds(int segmentIndex) {
- return segmentToGlobalOrds[segmentIndex];
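+      // callers pass the original segment numbering; remap to the internal sorted order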
+ return segmentToGlobalOrds[segmentMap.oldToNew(segmentIndex)];
}
/**
@@ -529,7 +624,7 @@ public class MultiDocValues {
* segment that contains this term.
*/
public int getFirstSegmentNumber(long globalOrd) {
- return (int) firstSegments.get(globalOrd);
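+      // firstSegments stores internal (sorted) segment ids; map back to the caller's numbering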
+ return segmentMap.newToOld((int) firstSegments.get(globalOrd));
}
/**
@@ -559,7 +654,6 @@ public class MultiDocValues {
  /** Creates a new MultiSortedDocValues over <code>values</code> */
MultiSortedDocValues(SortedDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException {
- assert values.length == mapping.segmentToGlobalOrds.length;
assert docStarts.length == values.length + 1;
this.values = values;
this.docStarts = docStarts;
@@ -570,7 +664,7 @@ public class MultiDocValues {
public int getOrd(int docID) {
int subIndex = ReaderUtil.subIndex(docID, docStarts);
int segmentOrd = values[subIndex].getOrd(docID - docStarts[subIndex]);
- return segmentOrd == -1 ? segmentOrd : (int) mapping.segmentToGlobalOrds[subIndex].get(segmentOrd);
+ return segmentOrd == -1 ? segmentOrd : (int) mapping.getGlobalOrds(subIndex).get(segmentOrd);
}
@Override
@@ -598,10 +692,10 @@ public class MultiDocValues {
    /** ordinal map mapping ords from <code>values</code> to global ord space */
public final OrdinalMap mapping;
int currentSubIndex;
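+    // cached global-ord view of the current document's segment, refreshed in setDocument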
+ LongValues currentGlobalOrds;
  /** Creates a new MultiSortedSetDocValues over <code>values</code> */
MultiSortedSetDocValues(SortedSetDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException {
- assert values.length == mapping.segmentToGlobalOrds.length;
assert docStarts.length == values.length + 1;
this.values = values;
this.docStarts = docStarts;
@@ -614,13 +708,14 @@ public class MultiDocValues {
if (segmentOrd == NO_MORE_ORDS) {
return segmentOrd;
} else {
- return mapping.segmentToGlobalOrds[currentSubIndex].get(segmentOrd);
+ return currentGlobalOrds.get(segmentOrd);
}
}
@Override
public void setDocument(int docID) {
currentSubIndex = ReaderUtil.subIndex(docID, docStarts);
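+      // resolve the per-segment view once per document instead of once per nextOrd() call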
+ currentGlobalOrds = mapping.getGlobalOrds(currentSubIndex);
values[currentSubIndex].setDocument(docID - docStarts[currentSubIndex]);
}