From c33d62adbc2be5a019de1f2b5344e0ba1eb98d9a Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 19 Jun 2019 22:44:22 +0200 Subject: [PATCH] Reduce the number of docvalues iterator created in the global ordinals fielddata (#43091) Today the fielddata for global ordinals re-creates docvalues readers of each segment when building the iterator of a single segment. This is required because the lookup of global ordinals needs to access the docvalues's TermsEnum of each segment to retrieve the original terms. This also means that we need to create NxN (where N is the number of segment in the index) docvalues iterators each time we want to collect global ordinal values. This wasn't an issue in previous versions since docvalues readers are stateless before 6.0 so they are reused on each segment but now that docvalues are iterators we need to create a new instance each time we want to access the values. In order to avoid creating too many iterators this change splits the global ordinals fielddata in two classes, one that is used to cache a single instance per directory reader and one that is created from the cached instance that can be used by a single consumer. The latter creates the TermsEnum of each segment once and reuse them to create the segment's iterator. This prevents the creation of all TermsEnums each time we want to access the value of a single segment, hence reducing the number of docvalues iterator to create to Nx2 (one iterator and one lookup per segment). --- .../ordinals/GlobalOrdinalMapping.java | 12 +- .../GlobalOrdinalsIndexFieldData.java | 151 +++++++++++++----- .../plain/AbstractIndexOrdinalsFieldData.java | 12 ++ .../SortedSetDVOrdinalsIndexFieldData.java | 12 ++ .../AbstractStringFieldDataTestCase.java | 7 +- 5 files changed, 146 insertions(+), 48 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalMapping.java b/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalMapping.java index 2bf2abac957..33073dff251 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalMapping.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalMapping.java @@ -21,6 +21,7 @@ package org.elasticsearch.index.fielddata.ordinals; import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; @@ -34,12 +35,12 @@ final class GlobalOrdinalMapping extends SortedSetDocValues { private final SortedSetDocValues values; private final OrdinalMap ordinalMap; private final LongValues mapping; - private final SortedSetDocValues[] bytesValues; + private final TermsEnum[] lookups; - GlobalOrdinalMapping(OrdinalMap ordinalMap, SortedSetDocValues[] bytesValues, int segmentIndex) { + GlobalOrdinalMapping(OrdinalMap ordinalMap, SortedSetDocValues values, TermsEnum[] lookups, int segmentIndex) { super(); - this.values = bytesValues[segmentIndex]; - this.bytesValues = bytesValues; + this.values = values; + this.lookups = lookups; this.ordinalMap = ordinalMap; this.mapping = ordinalMap.getGlobalOrds(segmentIndex); } @@ -72,7 +73,8 @@ final class GlobalOrdinalMapping extends SortedSetDocValues { public BytesRef lookupOrd(long globalOrd) throws IOException { final long segmentOrd = ordinalMap.getFirstSegmentOrd(globalOrd); int readerIndex = ordinalMap.getFirstSegmentNumber(globalOrd); - return bytesValues[readerIndex].lookupOrd(segmentOrd); + lookups[readerIndex].seekExact(segmentOrd); + return lookups[readerIndex].term(); } @Override diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalsIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalsIndexFieldData.java index a869e5a40d4..b3c2b4c9259 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalsIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/ordinals/GlobalOrdinalsIndexFieldData.java @@ -22,53 +22,64 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.SortField; import org.apache.lucene.util.Accountable; import org.elasticsearch.common.Nullable; import org.elasticsearch.index.AbstractIndexComponent; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.fielddata.AtomicOrdinalsFieldData; -import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested; import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData; import org.elasticsearch.index.fielddata.ScriptDocValues; import org.elasticsearch.index.fielddata.plain.AbstractAtomicOrdinalsFieldData; import org.elasticsearch.search.MultiValueMode; +import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Collection; import java.util.Collections; import java.util.function.Function; /** - * {@link IndexFieldData} base class for concrete global ordinals implementations. + * Concrete implementation of {@link IndexOrdinalsFieldData} for global ordinals. + * A single instance of this class should be used to cache global ordinals per {@link DirectoryReader}. + * However {@link #loadGlobal(DirectoryReader)} always creates a new instance of {@link Consumer} from the cached + * value in order to reuse the segment's {@link TermsEnum} that are needed to retrieve terms from global ordinals. + * Each instance of {@link Consumer} uses a new set of {@link TermsEnum} that can be reused during the collection, + * this is done to avoid creating all segment's {@link TermsEnum} each time we want to access the values of a single + * segment. */ -public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent implements IndexOrdinalsFieldData, Accountable { +public final class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent implements IndexOrdinalsFieldData, Accountable { private final String fieldName; private final long memorySizeInBytes; private final OrdinalMap ordinalMap; - private final Atomic[] atomicReaders; + private final AtomicOrdinalsFieldData[] segmentAfd; private final Function> scriptFunction; - - protected GlobalOrdinalsIndexFieldData(IndexSettings indexSettings, String fieldName, AtomicOrdinalsFieldData[] segmentAfd, - OrdinalMap ordinalMap, long memorySizeInBytes, Function> scriptFunction) { + protected GlobalOrdinalsIndexFieldData(IndexSettings indexSettings, + String fieldName, + AtomicOrdinalsFieldData[] segmentAfd, + OrdinalMap ordinalMap, + long memorySizeInBytes, + Function> scriptFunction) { super(indexSettings); this.fieldName = fieldName; this.memorySizeInBytes = memorySizeInBytes; this.ordinalMap = ordinalMap; - this.atomicReaders = new Atomic[segmentAfd.length]; - for (int i = 0; i < segmentAfd.length; i++) { - atomicReaders[i] = new Atomic(segmentAfd[i], ordinalMap, i); - } + this.segmentAfd = segmentAfd; this.scriptFunction = scriptFunction; } + public IndexOrdinalsFieldData newConsumer(DirectoryReader source) { + return new Consumer(source, indexSettings); + } + @Override public AtomicOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception { - return load(context); + throw new IllegalStateException("loadDirect(LeafReaderContext) should not be called in this context"); } @Override @@ -92,9 +103,7 @@ public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent impleme } @Override - public void clear() { - // no need to clear, because this is cached and cleared in AbstractBytesIndexFieldData - } + public void clear() {} @Override public long ramBytesUsed() { @@ -109,7 +118,7 @@ public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent impleme @Override public AtomicOrdinalsFieldData load(LeafReaderContext context) { - return atomicReaders[context.ord]; + throw new IllegalStateException("load(LeafReaderContext) should not be called in this context"); } @Override @@ -117,46 +126,108 @@ public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent impleme return ordinalMap; } - private final class Atomic extends AbstractAtomicOrdinalsFieldData { + /** + * A non-thread safe {@link IndexOrdinalsFieldData} for global ordinals that creates the {@link TermsEnum} of each + * segment once and use them to provide a single lookup per segment. + */ + public class Consumer extends AbstractIndexComponent implements IndexOrdinalsFieldData, Accountable { + private final DirectoryReader source; + private TermsEnum[] lookups; - private final AtomicOrdinalsFieldData afd; - private final OrdinalMap ordinalMap; - private final int segmentIndex; + Consumer(DirectoryReader source, IndexSettings settings) { + super(settings); + this.source = source; + } - private Atomic(AtomicOrdinalsFieldData afd, OrdinalMap ordinalMap, int segmentIndex) { - super(scriptFunction); - this.afd = afd; - this.ordinalMap = ordinalMap; - this.segmentIndex = segmentIndex; + /** + * Lazy creation of the {@link TermsEnum} for each segment present in this reader + */ + private TermsEnum[] getOrLoadTermsEnums() { + if (lookups == null) { + lookups = new TermsEnum[segmentAfd.length]; + for (int i = 0; i < lookups.length; i++) { + try { + lookups[i] = segmentAfd[i].getOrdinalsValues().termsEnum(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to load terms enum", e); + } + } + } + return lookups; } @Override - public SortedSetDocValues getOrdinalsValues() { - final SortedSetDocValues values = afd.getOrdinalsValues(); - if (values.getValueCount() == ordinalMap.getValueCount()) { - // segment ordinals match global ordinals - return values; - } - final SortedSetDocValues[] bytesValues = new SortedSetDocValues[atomicReaders.length]; - for (int i = 0; i < bytesValues.length; i++) { - bytesValues[i] = atomicReaders[i].afd.getOrdinalsValues(); - } - return new GlobalOrdinalMapping(ordinalMap, bytesValues, segmentIndex); + public AtomicOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception { + return load(context); } + @Override + public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) { + return this; + } + + @Override + public IndexOrdinalsFieldData localGlobalDirect(DirectoryReader indexReader) throws Exception { + return this; + } + + @Override + public String getFieldName() { + return fieldName; + } + + @Override + public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) { + throw new UnsupportedOperationException("no global ordinals sorting yet"); + } + + @Override + public void clear() {} + @Override public long ramBytesUsed() { - return afd.ramBytesUsed(); + return memorySizeInBytes; } @Override public Collection getChildResources() { - return afd.getChildResources(); + return Collections.emptyList(); } @Override - public void close() { + public AtomicOrdinalsFieldData load(LeafReaderContext context) { + assert source.getReaderCacheHelper().getKey() == context.parent.reader().getReaderCacheHelper().getKey(); + return new AbstractAtomicOrdinalsFieldData(scriptFunction) { + @Override + public SortedSetDocValues getOrdinalsValues() { + final SortedSetDocValues values = segmentAfd[context.ord].getOrdinalsValues(); + if (values.getValueCount() == ordinalMap.getValueCount()) { + // segment ordinals match global ordinals + return values; + } + final TermsEnum[] atomicLookups = getOrLoadTermsEnums(); + return new GlobalOrdinalMapping(ordinalMap, values, atomicLookups, context.ord); + } + + @Override + public long ramBytesUsed() { + return segmentAfd[context.ord].ramBytesUsed(); + } + + + @Override + public Collection getChildResources() { + return segmentAfd[context.ord].getChildResources(); + } + + @Override + public void close() {} + }; } + @Override + public OrdinalMap getOrdinalMap() { + return ordinalMap; + } } } diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractIndexOrdinalsFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractIndexOrdinalsFieldData.java index 0dc0de838a3..a7d63828138 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractIndexOrdinalsFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractIndexOrdinalsFieldData.java @@ -32,6 +32,7 @@ import org.elasticsearch.index.fielddata.AtomicOrdinalsFieldData; import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData; import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder; +import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData; import org.elasticsearch.indices.breaker.CircuitBreakerService; import java.io.IOException; @@ -60,6 +61,17 @@ public abstract class AbstractIndexOrdinalsFieldData extends AbstractIndexFieldD @Override public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) { + IndexOrdinalsFieldData fieldData = loadGlobalInternal(indexReader); + if (fieldData instanceof GlobalOrdinalsIndexFieldData) { + // we create a new instance of the cached value for each consumer in order + // to avoid creating new TermsEnums for each segment in the cached instance + return ((GlobalOrdinalsIndexFieldData) fieldData).newConsumer(indexReader); + } else { + return fieldData; + } + } + + private IndexOrdinalsFieldData loadGlobalInternal(DirectoryReader indexReader) { if (indexReader.leaves().size() <= 1) { // ordinals are already global return this; diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/SortedSetDVOrdinalsIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/SortedSetDVOrdinalsIndexFieldData.java index b71dcc75934..836c332a340 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/SortedSetDVOrdinalsIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/SortedSetDVOrdinalsIndexFieldData.java @@ -37,6 +37,7 @@ import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData; import org.elasticsearch.index.fielddata.ScriptDocValues; import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; +import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData; import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder; import org.elasticsearch.indices.breaker.CircuitBreakerService; import org.elasticsearch.search.MultiValueMode; @@ -92,6 +93,17 @@ public class SortedSetDVOrdinalsIndexFieldData extends DocValuesIndexFieldData i @Override public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) { + IndexOrdinalsFieldData fieldData = loadGlobalInternal(indexReader); + if (fieldData instanceof GlobalOrdinalsIndexFieldData) { + // we create a new instance of the cached value for each consumer in order + // to avoid creating new TermsEnums for each segment in the cached instance + return ((GlobalOrdinalsIndexFieldData) fieldData).newConsumer(indexReader); + } else { + return fieldData; + } + } + + private IndexOrdinalsFieldData loadGlobalInternal(DirectoryReader indexReader) { if (indexReader.leaves().size() <= 1) { // ordinals are already global return this; diff --git a/server/src/test/java/org/elasticsearch/index/fielddata/AbstractStringFieldDataTestCase.java b/server/src/test/java/org/elasticsearch/index/fielddata/AbstractStringFieldDataTestCase.java index 21d84203f6d..1539a800f2a 100644 --- a/server/src/test/java/org/elasticsearch/index/fielddata/AbstractStringFieldDataTestCase.java +++ b/server/src/test/java/org/elasticsearch/index/fielddata/AbstractStringFieldDataTestCase.java @@ -465,7 +465,7 @@ public abstract class AbstractStringFieldDataTestCase extends AbstractFieldDataI assertThat(topLevelReader.leaves().size(), equalTo(3)); // First segment - assertThat(globalOrdinals, instanceOf(GlobalOrdinalsIndexFieldData.class)); + assertThat(globalOrdinals, instanceOf(GlobalOrdinalsIndexFieldData.Consumer.class)); LeafReaderContext leaf = topLevelReader.leaves().get(0); AtomicOrdinalsFieldData afd = globalOrdinals.load(leaf); SortedSetDocValues values = afd.getOrdinalsValues(); @@ -590,7 +590,7 @@ public abstract class AbstractStringFieldDataTestCase extends AbstractFieldDataI IndexOrdinalsFieldData ifd = getForField("string", "value", hasDocValues()); IndexOrdinalsFieldData globalOrdinals = ifd.loadGlobal(topLevelReader); assertNotNull(globalOrdinals.getOrdinalMap()); - assertThat(ifd.loadGlobal(topLevelReader), sameInstance(globalOrdinals)); + assertThat(ifd.loadGlobal(topLevelReader).getOrdinalMap(), sameInstance(globalOrdinals.getOrdinalMap())); // 3 b/c 1 segment level caches and 1 top level cache // in case of doc values, we don't cache atomic FD, so only the top-level cache is there assertThat(indicesFieldDataCache.getCache().weight(), equalTo(hasDocValues() ? 1L : 4L)); @@ -602,7 +602,8 @@ public abstract class AbstractStringFieldDataTestCase extends AbstractFieldDataI break; } } - assertThat(cachedInstance, sameInstance(globalOrdinals)); + assertNotSame(cachedInstance, globalOrdinals); + assertThat(cachedInstance.getOrdinalMap(), sameInstance(globalOrdinals.getOrdinalMap())); topLevelReader.close(); // Now only 3 segment level entries, only the toplevel reader has been closed, but the segment readers are still used by IW assertThat(indicesFieldDataCache.getCache().weight(), equalTo(hasDocValues() ? 0L : 3L));