mirror of https://github.com/apache/lucene.git

commit e98bcb8254 (parent 532d9f246b)

    LUCENE-5782: Improve OrdinalMap compression by sorting the supplied terms enums

    git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1604387 13f79535-47bb-0310-9956-ffa450edef68
@@ -110,6 +110,9 @@ Optimizations
 * LUCENE-5780: Make OrdinalMap more memory-efficient, especially in case the
   first segment has all values. (Adrien Grand, Robert Muir)
 
+* LUCENE-5782: OrdinalMap now sorts enums before being built in order to
+  improve compression. (Adrien Grand)
+
 ======================= Lucene 4.9.0 =======================
 
 Changes in Runtime Behavior
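Why sorting by weight helps: for every merged (global) ordinal, OrdinalMap stores the index of the first sub that contains the term plus the delta between the global ordinal and that sub's ordinal. With the largest term dictionary first, most first-segment entries are 0 and most deltas stay small, so the packed buffers need fewer bits per value. A self-contained toy illustration of that skew (plain Java, not Lucene code; the counting scheme is only illustrative):

```java
import java.util.Arrays;
import java.util.TreeSet;

public class SortedSubsDemo {
  // For each merged term, find the first sub containing it and accumulate
  // how often that sub is not sub 0, plus the ord deltas to compress.
  static void stats(String label, String[][] subs) {
    TreeSet<String> merged = new TreeSet<>();
    for (String[] sub : subs) {
      merged.addAll(Arrays.asList(sub));
    }
    int nonZeroFirst = 0;
    long sumDelta = 0;
    int globalOrd = 0;
    for (String term : merged) {
      for (int i = 0; i < subs.length; i++) {
        int segOrd = Arrays.binarySearch(subs[i], term);
        if (segOrd >= 0) { // first sub that contains this term
          if (i != 0) nonZeroFirst++;
          sumDelta += globalOrd - segOrd;
          break;
        }
      }
      globalOrd++;
    }
    System.out.println(label + ": non-zero firstSegments=" + nonZeroFirst
        + ", sum of ord deltas=" + sumDelta);
  }

  public static void main(String[] args) {
    String[] big = {"a", "b", "c", "d", "e", "f", "g", "h"}; // sorted term dictionaries
    String[] small = {"c", "x"};
    stats("small sub first", new String[][] {small, big});
    stats("big sub first  ", new String[][] {big, small});
  }
}
```

With the big sub first, only the terms it misses need to point at another segment, so the first-segment buffer is almost entirely zeros.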
@@ -40,6 +40,7 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LongBitSet;
 import org.apache.lucene.util.LongValues;
+import org.apache.lucene.util.packed.PackedInts;
 
 /**
  * Abstract API that consumes numeric, binary and
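The PackedInts import is used to pass an acceptableOverheadRatio to OrdinalMap.build at the call sites below: the ratio bounds how much memory a packed structure may waste per value in exchange for faster reads, and PackedInts.COMPACT allows none. A hedged sketch of the trade-off, assuming Lucene 4.9's PackedInts utility methods (valueCount and bitsNeeded here are arbitrary):

```java
import org.apache.lucene.util.packed.PackedInts;

public class OverheadRatioDemo {
  public static void main(String[] args) {
    int valueCount = 1 << 20;
    int bitsNeeded = PackedInts.bitsRequired(17); // 5 bits suffice for values <= 17
    // COMPACT never wastes bits; DEFAULT may round bitsPerValue up to a
    // faster, more machine-friendly representation.
    PackedInts.FormatAndBits compact =
        PackedInts.fastestFormatAndBits(valueCount, bitsNeeded, PackedInts.COMPACT);
    PackedInts.FormatAndBits deflt =
        PackedInts.fastestFormatAndBits(valueCount, bitsNeeded, PackedInts.DEFAULT);
    System.out.println("COMPACT: " + compact.bitsPerValue + " bits/value");
    System.out.println("DEFAULT: " + deflt.bitsPerValue + " bits/value");
  }
}
```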
@@ -440,12 +441,14 @@ public abstract class DocValuesConsumer implements Closeable {
 
     // step 1: iterate thru each sub and mark terms still in use
     TermsEnum liveTerms[] = new TermsEnum[dvs.length];
+    long[] weights = new long[liveTerms.length];
     for (int sub = 0; sub < liveTerms.length; sub++) {
       AtomicReader reader = readers[sub];
       SortedDocValues dv = dvs[sub];
       Bits liveDocs = reader.getLiveDocs();
       if (liveDocs == null) {
         liveTerms[sub] = dv.termsEnum();
+        weights[sub] = dv.getValueCount();
       } else {
         LongBitSet bitset = new LongBitSet(dv.getValueCount());
         for (int i = 0; i < reader.maxDoc(); i++) {
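Each weight estimates how many terms a sub contributes: the full value count when no documents are deleted, otherwise only the values still reachable from live documents, computed via the bitset above. A minimal stand-alone sketch of that rule, using java.util.BitSet in place of Lucene's Bits and LongBitSet (all names here are illustrative, not Lucene's):

```java
import java.util.BitSet;

public class WeightsDemo {
  /** valueCount when nothing is deleted; otherwise the number of live values. */
  static long weight(int valueCount, int[] ordPerDoc, BitSet liveDocs) {
    if (liveDocs == null) {
      return valueCount;
    }
    BitSet liveValues = new BitSet(valueCount);
    for (int doc = 0; doc < ordPerDoc.length; doc++) {
      if (liveDocs.get(doc) && ordPerDoc[doc] >= 0) {
        liveValues.set(ordPerDoc[doc]); // mark this value as still in use
      }
    }
    return liveValues.cardinality();
  }

  public static void main(String[] args) {
    int[] ords = {0, 1, 1, 2};   // per-document value ordinals
    BitSet live = new BitSet(4);
    live.set(0); live.set(3);    // docs 1 and 2 are deleted
    System.out.println(weight(3, ords, null)); // 3: no deletions
    System.out.println(weight(3, ords, live)); // 2: only values 0 and 2 survive
  }
}
```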
@@ -457,11 +460,12 @@ public abstract class DocValuesConsumer implements Closeable {
           }
         }
         liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
+        weights[sub] = bitset.cardinality();
       }
     }
 
     // step 2: create ordinal map (this conceptually does the "merging")
-    final OrdinalMap map = new OrdinalMap(this, liveTerms);
+    final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT);
 
     // step 3: add field
     addSortedField(fieldInfo,
@@ -576,12 +580,14 @@ public abstract class DocValuesConsumer implements Closeable {
 
     // step 1: iterate thru each sub and mark terms still in use
     TermsEnum liveTerms[] = new TermsEnum[dvs.length];
+    long[] weights = new long[liveTerms.length];
     for (int sub = 0; sub < liveTerms.length; sub++) {
       AtomicReader reader = readers[sub];
       SortedSetDocValues dv = dvs[sub];
       Bits liveDocs = reader.getLiveDocs();
       if (liveDocs == null) {
         liveTerms[sub] = dv.termsEnum();
+        weights[sub] = dv.getValueCount();
       } else {
         LongBitSet bitset = new LongBitSet(dv.getValueCount());
         for (int i = 0; i < reader.maxDoc(); i++) {
@@ -594,11 +600,12 @@ public abstract class DocValuesConsumer implements Closeable {
           }
         }
         liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
+        weights[sub] = bitset.cardinality();
       }
     }
 
     // step 2: create ordinal map (this conceptually does the "merging")
-    final OrdinalMap map = new OrdinalMap(this, liveTerms);
+    final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT);
 
     // step 3: add field
     addSortedSetField(fieldInfo,
@@ -18,6 +18,7 @@ package org.apache.lucene.index;
  */
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex;
@@ -25,6 +26,7 @@ import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
 import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.AppendingPackedLongBuffer;
@@ -322,11 +324,7 @@ public class MultiDocValues {
     if (!anyReal) {
       return null;
     } else {
-      TermsEnum enums[] = new TermsEnum[values.length];
-      for (int i = 0; i < values.length; i++) {
-        enums[i] = values[i].termsEnum();
-      }
-      OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums);
+      OrdinalMap mapping = OrdinalMap.build(r.getCoreCacheKey(), values, PackedInts.DEFAULT);
       return new MultiSortedDocValues(values, starts, mapping);
     }
   }
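Callers go through MultiDocValues as before; only the construction path changes. A hedged end-to-end sketch assuming the Lucene 4.9 API (the field name is illustrative, and WhitespaceAnalyzer requires the lucene-analyzers-common module):

```java
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class GlobalOrdsDemo {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriterConfig iwc =
        new IndexWriterConfig(Version.LUCENE_4_9, new WhitespaceAnalyzer(Version.LUCENE_4_9));
    IndexWriter writer = new IndexWriter(dir, iwc);
    for (String value : new String[] {"apple", "banana"}) {
      Document doc = new Document();
      doc.add(new SortedDocValuesField("category", new BytesRef(value)));
      writer.addDocument(doc);
      writer.commit(); // force one segment per document
    }
    writer.close();

    DirectoryReader reader = DirectoryReader.open(dir);
    // Behind the scenes this now goes through OrdinalMap.build(...) with
    // PackedInts.DEFAULT, as in the hunk above.
    SortedDocValues merged = MultiDocValues.getSortedValues(reader, "category");
    System.out.println("global ord of doc 0: " + merged.getOrd(0));
    System.out.println("global ord of doc 1: " + merged.getOrd(1));
    reader.close();
  }
}
```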
@@ -366,20 +364,125 @@ public class MultiDocValues {
     if (!anyReal) {
       return null;
     } else {
-      TermsEnum enums[] = new TermsEnum[values.length];
-      for (int i = 0; i < values.length; i++) {
-        enums[i] = values[i].termsEnum();
-      }
-      OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums);
+      OrdinalMap mapping = OrdinalMap.build(r.getCoreCacheKey(), values, PackedInts.DEFAULT);
       return new MultiSortedSetDocValues(values, starts, mapping);
     }
   }
 
   /** maps per-segment ordinals to/from global ordinal space */
+  // TODO: we could also have a utility method to merge Terms[] and use size() as a weight when we need it
   // TODO: use more efficient packed ints structures?
   // TODO: pull this out? its pretty generic (maps between N ord()-enabled TermsEnums)
   public static class OrdinalMap implements Accountable {
 
+    private static class SegmentMap implements Accountable {
+      private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SegmentMap.class);
+
+      /** Build a map from an index into a sorted view of `weights` to an index into `weights`. */
+      private static int[] map(final long[] weights) {
+        final int[] newToOld = new int[weights.length];
+        for (int i = 0; i < weights.length; ++i) {
+          newToOld[i] = i;
+        }
+        new InPlaceMergeSorter() {
+          @Override
+          protected void swap(int i, int j) {
+            final int tmp = newToOld[i];
+            newToOld[i] = newToOld[j];
+            newToOld[j] = tmp;
+          }
+          @Override
+          protected int compare(int i, int j) {
+            // j first since we actually want higher weights first
+            return Long.compare(weights[newToOld[j]], weights[newToOld[i]]);
+          }
+        }.sort(0, weights.length);
+        return newToOld;
+      }
+
+      /** Inverse the map. */
+      private static int[] inverse(int[] map) {
+        final int[] inverse = new int[map.length];
+        for (int i = 0; i < map.length; ++i) {
+          inverse[map[i]] = i;
+        }
+        return inverse;
+      }
+
+      private final int[] newToOld, oldToNew;
+
+      SegmentMap(long[] weights) {
+        newToOld = map(weights);
+        oldToNew = inverse(newToOld);
+        assert Arrays.equals(newToOld, inverse(oldToNew));
+      }
+
+      int newToOld(int segment) {
+        return newToOld[segment];
+      }
+
+      int oldToNew(int segment) {
+        return oldToNew[segment];
+      }
+
+      @Override
+      public long ramBytesUsed() {
+        return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(newToOld) + RamUsageEstimator.sizeOf(oldToNew);
+      }
+
+    }
+
+    /**
+     * Create an ordinal map that uses the number of unique values of each
+     * {@link SortedDocValues} instance as a weight.
+     * @see #build(Object, TermsEnum[], long[], float)
+     */
+    public static OrdinalMap build(Object owner, SortedDocValues[] values, float acceptableOverheadRatio) throws IOException {
+      final TermsEnum[] subs = new TermsEnum[values.length];
+      final long[] weights = new long[values.length];
+      for (int i = 0; i < values.length; ++i) {
+        subs[i] = values[i].termsEnum();
+        weights[i] = values[i].getValueCount();
+      }
+      return build(owner, subs, weights, acceptableOverheadRatio);
+    }
+
+    /**
+     * Create an ordinal map that uses the number of unique values of each
+     * {@link SortedSetDocValues} instance as a weight.
+     * @see #build(Object, TermsEnum[], long[], float)
+     */
+    public static OrdinalMap build(Object owner, SortedSetDocValues[] values, float acceptableOverheadRatio) throws IOException {
+      final TermsEnum[] subs = new TermsEnum[values.length];
+      final long[] weights = new long[values.length];
+      for (int i = 0; i < values.length; ++i) {
+        subs[i] = values[i].termsEnum();
+        weights[i] = values[i].getValueCount();
+      }
+      return build(owner, subs, weights, acceptableOverheadRatio);
+    }
+
+    /**
+     * Creates an ordinal map that allows mapping ords to/from a merged
+     * space from <code>subs</code>.
+     * @param owner a cache key
+     * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need
+     *             not be dense (e.g. can be FilteredTermsEnums}.
+     * @param weights a weight for each sub. This is ideally correlated with
+     *             the number of unique terms that each sub introduces compared
+     *             to the other subs
+     * @throws IOException if an I/O error occurred.
+     */
+    public static OrdinalMap build(Object owner, TermsEnum subs[], long[] weights, float acceptableOverheadRatio) throws IOException {
+      if (subs.length != weights.length) {
+        throw new IllegalArgumentException("subs and weights must have the same length");
+      }
+
+      // enums are not sorted, so let's sort to save memory
+      final SegmentMap segmentMap = new SegmentMap(weights);
+      return new OrdinalMap(owner, subs, segmentMap, acceptableOverheadRatio);
+    }
+
     private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OrdinalMap.class);
 
     // cache key of whoever asked for this awful thing
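To make SegmentMap's two arrays concrete: newToOld[i] names the original segment that ranks i-th by weight, and oldToNew is its inverse. A stand-alone re-implementation of the same computation (plain Java, not the private Lucene class; Arrays.sort stands in for the stable InPlaceMergeSorter, and both keep ties in original order):

```java
import java.util.Arrays;

public class SegmentMapDemo {
  public static void main(String[] args) {
    long[] weights = {10, 50, 30};

    // newToOld: sort segment indices by descending weight.
    Integer[] order = {0, 1, 2};
    Arrays.sort(order, (a, b) -> Long.compare(weights[b], weights[a])); // heavier first
    int[] newToOld = new int[weights.length];
    for (int i = 0; i < order.length; i++) {
      newToOld[i] = order[i];
    }

    // oldToNew: where did each original segment land after sorting?
    int[] oldToNew = new int[newToOld.length];
    for (int i = 0; i < newToOld.length; i++) {
      oldToNew[newToOld[i]] = i;
    }

    System.out.println(Arrays.toString(newToOld)); // [1, 2, 0]
    System.out.println(Arrays.toString(oldToNew)); // [2, 0, 1]
  }
}
```

Segment 1 (weight 50) becomes internal segment 0, so its ordinals dominate the map's "first segment" entries.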
@@ -390,21 +493,16 @@ public class MultiDocValues {
     final AppendingPackedLongBuffer firstSegments;
     // for every segment, segmentOrd -> globalOrd
     final LongValues segmentToGlobalOrds[];
+    // the map from/to segment ids
+    final SegmentMap segmentMap;
     // ram usage
     final long ramBytesUsed;
 
-    /**
-     * Creates an ordinal map that allows mapping ords to/from a merged
-     * space from <code>subs</code>.
-     * @param owner a cache key
-     * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need
-     *             not be dense (e.g. can be FilteredTermsEnums}.
-     * @throws IOException if an I/O error occurred.
-     */
-    public OrdinalMap(Object owner, TermsEnum subs[], float acceptableOverheadRatio) throws IOException {
+    OrdinalMap(Object owner, TermsEnum subs[], SegmentMap segmentMap, float acceptableOverheadRatio) throws IOException {
       // create the ordinal mappings by pulling a termsenum over each sub's
       // unique terms, and walking a multitermsenum over those
       this.owner = owner;
+      this.segmentMap = segmentMap;
       // even though we accept an overhead ratio, we keep these ones with COMPACT
       // since they are only used to resolve values given a global ord, which is
       // slow anyway
@@ -420,7 +518,7 @@ public class MultiDocValues {
       TermsEnumIndex indexes[] = new TermsEnumIndex[slices.length];
       for (int i = 0; i < slices.length; i++) {
         slices[i] = new ReaderSlice(0, 0, i);
-        indexes[i] = new TermsEnumIndex(subs[i], i);
+        indexes[i] = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i);
       }
       MultiTermsEnum mte = new MultiTermsEnum(slices);
       mte.reset(indexes);
@@ -460,7 +558,9 @@ public class MultiDocValues {
       }
       // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
       segmentToGlobalOrds = new LongValues[subs.length];
-      long ramBytesUsed = BASE_RAM_BYTES_USED + globalOrdDeltas.ramBytesUsed() + firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds);
+      long ramBytesUsed = BASE_RAM_BYTES_USED + globalOrdDeltas.ramBytesUsed()
+          + firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
+          + segmentMap.ramBytesUsed();
       for (int i = 0; i < ordDeltas.length; ++i) {
         final MonotonicAppendingLongBuffer deltas = ordDeltas[i];
         if (ordDeltaBits[i] == 0L) {
@@ -503,17 +603,12 @@ public class MultiDocValues {
       this.ramBytesUsed = ramBytesUsed;
     }
 
-    /** Create an {@link OrdinalMap} with the default overhead ratio. */
-    public OrdinalMap(Object owner, TermsEnum subs[]) throws IOException {
-      this(owner, subs, PackedInts.DEFAULT);
-    }
-
     /**
      * Given a segment number, return a {@link LongValues} instance that maps
      * segment ordinals to global ordinals.
      */
     public LongValues getGlobalOrds(int segmentIndex) {
-      return segmentToGlobalOrds[segmentIndex];
+      return segmentToGlobalOrds[segmentMap.oldToNew(segmentIndex)];
     }
 
     /**
@@ -529,7 +624,7 @@ public class MultiDocValues {
      * segment that contains this term.
      */
     public int getFirstSegmentNumber(long globalOrd) {
-      return (int) firstSegments.get(globalOrd);
+      return segmentMap.newToOld((int) firstSegments.get(globalOrd));
    }
 
     /**
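getGlobalOrds and getFirstSegmentNumber translate through the SegmentMap in opposite directions, so callers keep using their original segment numbering and never observe the weight-sorted internal order. A hedged usage sketch (map and values are assumed to come from OrdinalMap.build, as in the earlier demo; this is a helper, not a complete program):

```java
import org.apache.lucene.index.MultiDocValues.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.LongValues;

public class OrdTranslationDemo {
  static void show(OrdinalMap map, SortedDocValues[] values) {
    for (int seg = 0; seg < values.length; seg++) {
      // segment numbers are the caller's original ones; the map remaps internally
      LongValues toGlobal = map.getGlobalOrds(seg);
      for (int ord = 0; ord < values[seg].getValueCount(); ord++) {
        long globalOrd = toGlobal.get(ord);
        // also returns an index into the caller's original segment array
        int firstSeg = map.getFirstSegmentNumber(globalOrd);
        System.out.println("seg=" + seg + " ord=" + ord + " -> global=" + globalOrd
            + " (first seen in segment " + firstSeg + ")");
      }
    }
  }
}
```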
@@ -559,7 +654,6 @@ public class MultiDocValues {
 
     /** Creates a new MultiSortedDocValues over <code>values</code> */
     MultiSortedDocValues(SortedDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException {
-      assert values.length == mapping.segmentToGlobalOrds.length;
       assert docStarts.length == values.length + 1;
       this.values = values;
       this.docStarts = docStarts;
@@ -570,7 +664,7 @@ public class MultiDocValues {
     public int getOrd(int docID) {
       int subIndex = ReaderUtil.subIndex(docID, docStarts);
       int segmentOrd = values[subIndex].getOrd(docID - docStarts[subIndex]);
-      return segmentOrd == -1 ? segmentOrd : (int) mapping.segmentToGlobalOrds[subIndex].get(segmentOrd);
+      return segmentOrd == -1 ? segmentOrd : (int) mapping.getGlobalOrds(subIndex).get(segmentOrd);
     }
 
     @Override
@@ -598,10 +692,10 @@ public class MultiDocValues {
     /** ordinal map mapping ords from <code>values</code> to global ord space */
     public final OrdinalMap mapping;
     int currentSubIndex;
+    LongValues currentGlobalOrds;
 
     /** Creates a new MultiSortedSetDocValues over <code>values</code> */
     MultiSortedSetDocValues(SortedSetDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException {
-      assert values.length == mapping.segmentToGlobalOrds.length;
       assert docStarts.length == values.length + 1;
       this.values = values;
       this.docStarts = docStarts;
@@ -614,13 +708,14 @@ public class MultiDocValues {
       if (segmentOrd == NO_MORE_ORDS) {
         return segmentOrd;
       } else {
-        return mapping.segmentToGlobalOrds[currentSubIndex].get(segmentOrd);
+        return currentGlobalOrds.get(segmentOrd);
       }
     }
 
     @Override
     public void setDocument(int docID) {
       currentSubIndex = ReaderUtil.subIndex(docID, docStarts);
+      currentGlobalOrds = mapping.getGlobalOrds(currentSubIndex);
       values[currentSubIndex].setDocument(docID - docStarts[currentSubIndex]);
     }
 
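Caching currentGlobalOrds in setDocument resolves the per-segment LongValues once per document instead of once per nextOrd call on the hot path. Typical consumer-side iteration, as a hedged sketch against the Lucene 4.9 API (the field name and reader setup are illustrative):

```java
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;

public class SortedSetIterationDemo {
  static void printTags(DirectoryReader reader, int docID) throws Exception {
    SortedSetDocValues dv = MultiDocValues.getSortedSetValues(reader, "tags");
    dv.setDocument(docID); // positions the iterator and caches the global-ord view
    BytesRef scratch = new BytesRef();
    for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
      dv.lookupOrd(ord, scratch); // global ord -> term bytes
      System.out.println(scratch.utf8ToString());
    }
  }
}
```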