mirror of https://github.com/apache/lucene.git
LUCENE-9536: Optimize OrdinalMap when one segment contains all distinct values. (#1948)
LUCENE-9536: Optimize OrdinalMap when one segment contains all distinct values. For doc values fields whose cardinality is not too high, it is common for some large segments to contain all distinct values. In this case, we can check whether the first segment's ords map perfectly to global ords and, if so, store the global ord deltas and first segment indices as `LongValues.ZEROES` to save some space.
This commit is contained in:
parent
da0004875b
commit
8f004f7a38
|
@ -172,10 +172,12 @@ public class OrdinalMap implements Accountable {
|
||||||
|
|
||||||
/** Cache key of whoever asked for this awful thing */
|
/** Cache key of whoever asked for this awful thing */
|
||||||
public final IndexReader.CacheKey owner;
|
public final IndexReader.CacheKey owner;
|
||||||
|
// number of global ordinals
|
||||||
|
final long valueCount;
|
||||||
// globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the ordinal in the first segment that contains this term
|
// globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the ordinal in the first segment that contains this term
|
||||||
final PackedLongValues globalOrdDeltas;
|
final LongValues globalOrdDeltas;
|
||||||
// globalOrd -> first segment container
|
// globalOrd -> first segment container
|
||||||
final PackedLongValues firstSegments;
|
final LongValues firstSegments;
|
||||||
// for every segment, segmentOrd -> globalOrd
|
// for every segment, segmentOrd -> globalOrd
|
||||||
final LongValues segmentToGlobalOrds[];
|
final LongValues segmentToGlobalOrds[];
|
||||||
// the map from/to segment ids
|
// the map from/to segment ids
|
||||||
|
@ -271,13 +273,25 @@ public class OrdinalMap implements Accountable {
|
||||||
globalOrd++;
|
globalOrd++;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.firstSegments = firstSegments.build();
|
long ramBytesUsed = BASE_RAM_BYTES_USED + segmentMap.ramBytesUsed();
|
||||||
this.globalOrdDeltas = globalOrdDeltas.build();
|
this.valueCount = globalOrd;
|
||||||
|
|
||||||
|
// If the first segment contains all of the global ords, then we can apply a small optimization
|
||||||
|
// and hardcode the first segment indices and global ord deltas as all zeroes.
|
||||||
|
if (ordDeltaBits.length > 0 && ordDeltaBits[0] == 0L && ordDeltas[0].size() == this.valueCount) {
|
||||||
|
this.firstSegments = LongValues.ZEROES;
|
||||||
|
this.globalOrdDeltas = LongValues.ZEROES;
|
||||||
|
} else {
|
||||||
|
PackedLongValues packedFirstSegments = firstSegments.build();
|
||||||
|
PackedLongValues packedGlobalOrdDeltas = globalOrdDeltas.build();
|
||||||
|
this.firstSegments = packedFirstSegments;
|
||||||
|
this.globalOrdDeltas = packedGlobalOrdDeltas;
|
||||||
|
ramBytesUsed += packedFirstSegments.ramBytesUsed() + packedGlobalOrdDeltas.ramBytesUsed();
|
||||||
|
}
|
||||||
|
|
||||||
// ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
|
// ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
|
||||||
segmentToGlobalOrds = new LongValues[subs.length];
|
segmentToGlobalOrds = new LongValues[subs.length];
|
||||||
long ramBytesUsed = BASE_RAM_BYTES_USED + this.globalOrdDeltas.ramBytesUsed()
|
ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds);
|
||||||
+ this.firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
|
|
||||||
+ segmentMap.ramBytesUsed();
|
|
||||||
for (int i = 0; i < ordDeltas.length; ++i) {
|
for (int i = 0; i < ordDeltas.length; ++i) {
|
||||||
final PackedLongValues deltas = ordDeltas[i].build();
|
final PackedLongValues deltas = ordDeltas[i].build();
|
||||||
if (ordDeltaBits[i] == 0L) {
|
if (ordDeltaBits[i] == 0L) {
|
||||||
|
@ -317,6 +331,7 @@ public class OrdinalMap implements Accountable {
|
||||||
ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
|
ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
this.ramBytesUsed = ramBytesUsed;
|
this.ramBytesUsed = ramBytesUsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -348,7 +363,7 @@ public class OrdinalMap implements Accountable {
|
||||||
* Returns the total number of unique terms in global ord space.
|
* Returns the total number of unique terms in global ord space.
|
||||||
*/
|
*/
|
||||||
public long getValueCount() {
|
public long getValueCount() {
|
||||||
return globalOrdDeltas.size();
|
return valueCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -359,10 +374,9 @@ public class OrdinalMap implements Accountable {
|
||||||
@Override
|
@Override
|
||||||
public Collection<Accountable> getChildResources() {
|
public Collection<Accountable> getChildResources() {
|
||||||
List<Accountable> resources = new ArrayList<>();
|
List<Accountable> resources = new ArrayList<>();
|
||||||
resources.add(Accountables.namedAccountable("global ord deltas", globalOrdDeltas));
|
|
||||||
resources.add(Accountables.namedAccountable("first segments", firstSegments));
|
|
||||||
resources.add(Accountables.namedAccountable("segment map", segmentMap));
|
resources.add(Accountables.namedAccountable("segment map", segmentMap));
|
||||||
// TODO: would be nice to return actual child segment deltas too, but the optimizations are confusing
|
// TODO: would be nice to return the ordinal and segment maps too, but it's not straightforward
|
||||||
|
// because of optimizations.
|
||||||
return resources;
|
return resources;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,10 +17,6 @@
|
||||||
package org.apache.lucene.index;
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.lang.reflect.Field;
|
|
||||||
import java.util.HashMap;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.SortedDocValuesField;
|
import org.apache.lucene.document.SortedDocValuesField;
|
||||||
|
@ -32,6 +28,10 @@ import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.RamUsageTester;
|
import org.apache.lucene.util.RamUsageTester;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.lang.reflect.Field;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
public class TestOrdinalMap extends LuceneTestCase {
|
public class TestOrdinalMap extends LuceneTestCase {
|
||||||
|
|
||||||
private static final Field ORDINAL_MAP_OWNER_FIELD;
|
private static final Field ORDINAL_MAP_OWNER_FIELD;
|
||||||
|
@ -46,7 +46,7 @@ public class TestOrdinalMap extends LuceneTestCase {
|
||||||
private static final RamUsageTester.Accumulator ORDINAL_MAP_ACCUMULATOR = new RamUsageTester.Accumulator() {
|
private static final RamUsageTester.Accumulator ORDINAL_MAP_ACCUMULATOR = new RamUsageTester.Accumulator() {
|
||||||
|
|
||||||
public long accumulateObject(Object o, long shallowSize, java.util.Map<Field,Object> fieldValues, java.util.Collection<Object> queue) {
|
public long accumulateObject(Object o, long shallowSize, java.util.Map<Field,Object> fieldValues, java.util.Collection<Object> queue) {
|
||||||
if (o == LongValues.IDENTITY) {
|
if (o == LongValues.ZEROES || o == LongValues.IDENTITY) {
|
||||||
return 0L;
|
return 0L;
|
||||||
}
|
}
|
||||||
if (o instanceof OrdinalMap) {
|
if (o instanceof OrdinalMap) {
|
||||||
|
@ -95,4 +95,53 @@ public class TestOrdinalMap extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests the case where one segment contains all of the global ords. In this case, we apply a
|
||||||
|
* small optimization and hardcode the first segment indices and global ord deltas as all zeroes.
|
||||||
|
*/
|
||||||
|
public void testOneSegmentWithAllValues() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig cfg = new IndexWriterConfig(new MockAnalyzer(random())).setCodec(
|
||||||
|
TestUtil.alwaysDocValuesFormat(TestUtil.getDefaultDocValuesFormat()));
|
||||||
|
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, cfg);
|
||||||
|
|
||||||
|
int numTerms = 1000;
|
||||||
|
for (int i = 0; i < numTerms; ++i) {
|
||||||
|
Document d = new Document();
|
||||||
|
String term = String.valueOf(i);
|
||||||
|
d.add(new SortedDocValuesField("sdv", new BytesRef(term)));
|
||||||
|
iw.addDocument(d);
|
||||||
|
}
|
||||||
|
iw.forceMerge(1);
|
||||||
|
|
||||||
|
for (int i = 0; i < 10; ++i) {
|
||||||
|
Document d = new Document();
|
||||||
|
String term = String.valueOf(random().nextInt(numTerms));
|
||||||
|
d.add(new SortedDocValuesField("sdv", new BytesRef(term)));
|
||||||
|
iw.addDocument(d);
|
||||||
|
}
|
||||||
|
iw.commit();
|
||||||
|
|
||||||
|
DirectoryReader r = iw.getReader();
|
||||||
|
SortedDocValues sdv = MultiDocValues.getSortedValues(r, "sdv");
|
||||||
|
assertNotNull(sdv);
|
||||||
|
assertTrue(sdv instanceof MultiDocValues.MultiSortedDocValues);
|
||||||
|
|
||||||
|
// Check that the optimization kicks in.
|
||||||
|
OrdinalMap map = ((MultiDocValues.MultiSortedDocValues) sdv).mapping;
|
||||||
|
assertEquals(LongValues.ZEROES, map.firstSegments);
|
||||||
|
assertEquals(LongValues.ZEROES, map.globalOrdDeltas);
|
||||||
|
|
||||||
|
// Check the map's basic behavior.
|
||||||
|
assertEquals(numTerms, (int) map.getValueCount());
|
||||||
|
for (int i = 0; i < numTerms; i++) {
|
||||||
|
assertEquals(0, map.getFirstSegmentNumber(i));
|
||||||
|
assertEquals(i, map.getFirstSegmentOrd(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
iw.close();
|
||||||
|
r.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue