diff --git a/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java b/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
index bbb643f4e14..1e73f1cf0f5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
+++ b/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
@@ -172,10 +172,12 @@ public class OrdinalMap implements Accountable {
 
   /** Cache key of whoever asked for this awful thing */
   public final IndexReader.CacheKey owner;
+  // number of global ordinals, i.e. unique terms in global ord space
+  final long valueCount;
-  // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the the ordinal in the first segment that contains this term
-  final PackedLongValues globalOrdDeltas;
+  // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the ordinal in the first segment that contains this term
+  final LongValues globalOrdDeltas;
   // globalOrd -> first segment container
-  final PackedLongValues firstSegments;
+  final LongValues firstSegments;
   // for every segment, segmentOrd -> globalOrd
   final LongValues segmentToGlobalOrds[];
   // the map from/to segment ids
@@ -271,13 +273,25 @@ public class OrdinalMap implements Accountable {
       globalOrd++;
     }
 
-    this.firstSegments = firstSegments.build();
-    this.globalOrdDeltas = globalOrdDeltas.build();
+    long ramBytesUsed = BASE_RAM_BYTES_USED + segmentMap.ramBytesUsed();
+    this.valueCount = globalOrd;
+
+    // If the first segment contains all of the global ords, then we can apply a small optimization
+    // and hardcode the first segment indices and global ord deltas as all zeroes.
+    if (ordDeltaBits.length > 0 && ordDeltaBits[0] == 0L && ordDeltas[0].size() == this.valueCount) {
+      this.firstSegments = LongValues.ZEROES;
+      this.globalOrdDeltas = LongValues.ZEROES;
+    } else {
+      PackedLongValues packedFirstSegments = firstSegments.build();
+      PackedLongValues packedGlobalOrdDeltas = globalOrdDeltas.build();
+      this.firstSegments = packedFirstSegments;
+      this.globalOrdDeltas = packedGlobalOrdDeltas;
+      ramBytesUsed += packedFirstSegments.ramBytesUsed() + packedGlobalOrdDeltas.ramBytesUsed();
+    }
+
     // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
     segmentToGlobalOrds = new LongValues[subs.length];
-    long ramBytesUsed = BASE_RAM_BYTES_USED + this.globalOrdDeltas.ramBytesUsed()
-        + this.firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
-        + segmentMap.ramBytesUsed();
+    ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds);
     for (int i = 0; i < ordDeltas.length; ++i) {
       final PackedLongValues deltas = ordDeltas[i].build();
       if (ordDeltaBits[i] == 0L) {
@@ -317,6 +331,7 @@ public class OrdinalMap implements Accountable {
         ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
       }
     }
+
     this.ramBytesUsed = ramBytesUsed;
   }
 
@@ -348,7 +363,7 @@ public class OrdinalMap implements Accountable {
    * Returns the total number of unique terms in global ord space.
    */
   public long getValueCount() {
-    return globalOrdDeltas.size();
+    return valueCount;
   }
 
   @Override
@@ -359,10 +374,9 @@ public class OrdinalMap implements Accountable {
   @Override
   public Collection getChildResources() {
     List resources = new ArrayList<>();
-    resources.add(Accountables.namedAccountable("global ord deltas", globalOrdDeltas));
-    resources.add(Accountables.namedAccountable("first segments", firstSegments));
     resources.add(Accountables.namedAccountable("segment map", segmentMap));
-    // TODO: would be nice to return actual child segment deltas too, but the optimizations are confusing
+    // TODO: would be nice to return the ordinal and segment maps too, but it's not straightforward
+    // because of optimizations.
     return resources;
   }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java b/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
index 2b2ec3d5976..e7b2777778c 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
@@ -46,7 +46,7 @@ public class TestOrdinalMap extends LuceneTestCase {
 
   private static final RamUsageTester.Accumulator ORDINAL_MAP_ACCUMULATOR = new RamUsageTester.Accumulator() {
     public long accumulateObject(Object o, long shallowSize, java.util.Map fieldValues, java.util.Collection queue) {
-      if (o == LongValues.IDENTITY) {
+      if (o == LongValues.ZEROES || o == LongValues.IDENTITY) {
         return 0L;
       }
       if (o instanceof OrdinalMap) {
@@ -95,4 +95,55 @@ public class TestOrdinalMap extends LuceneTestCase {
     dir.close();
   }
 
+  /**
+   * Tests the case where one segment contains all of the global ords. In this case, we apply a
+   * small optimization and hardcode the first segment indices and global ord deltas as all zeroes.
+   */
+  public void testOneSegmentWithAllValues() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig cfg = new IndexWriterConfig(new MockAnalyzer(random())).setCodec(
+        TestUtil.alwaysDocValuesFormat(TestUtil.getDefaultDocValuesFormat()));
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, cfg);
+
+    int numTerms = 1000;
+    // Index numTerms distinct terms, then merge them so that one segment holds every term.
+    for (int i = 0; i < numTerms; ++i) {
+      Document d = new Document();
+      String term = String.valueOf(i);
+      d.add(new SortedDocValuesField("sdv", new BytesRef(term)));
+      iw.addDocument(d);
+    }
+    iw.forceMerge(1);
+
+    // Add a few more documents that only reuse terms which were already indexed above.
+    for (int i = 0; i < 10; ++i) {
+      Document d = new Document();
+      String term = String.valueOf(random().nextInt(numTerms));
+      d.add(new SortedDocValuesField("sdv", new BytesRef(term)));
+      iw.addDocument(d);
+    }
+    iw.commit();
+
+    DirectoryReader r = iw.getReader();
+    SortedDocValues sdv = MultiDocValues.getSortedValues(r, "sdv");
+    assertNotNull(sdv);
+    assertTrue(sdv instanceof MultiDocValues.MultiSortedDocValues);
+
+    // Check that the optimization kicks in.
+    OrdinalMap map = ((MultiDocValues.MultiSortedDocValues) sdv).mapping;
+    assertSame(LongValues.ZEROES, map.firstSegments);
+    assertSame(LongValues.ZEROES, map.globalOrdDeltas);
+
+    // Check the map's basic behavior.
+    assertEquals(numTerms, map.getValueCount());
+    for (int i = 0; i < numTerms; i++) {
+      assertEquals(0, map.getFirstSegmentNumber(i));
+      assertEquals(i, map.getFirstSegmentOrd(i));
+    }
+
+    iw.close();
+    r.close();
+    dir.close();
+  }
+
 }