Reduce the number of docvalues iterator created in the global ordinals fielddata (#43091)

Today the fielddata for global ordinals re-creates docvalues readers of each segment
when building the iterator of a single segment. This is required because the lookup of
global ordinals needs to access the docvalues's TermsEnum of each segment to retrieve
the original terms. This also means that we need to create NxN (where N is the number of segment in the index) docvalues iterators
each time we want to collect global ordinal values. This wasn't an issue in previous versions since docvalues readers are stateless
before 6.0 so they are reused on each segment but now that docvalues are iterators we need to create a new instance each time
we want to access the values. In order to avoid creating too many iterators this change splits
the global ordinals fielddata in two classes, one that is used to cache a single instance per directory reader and one
that is created from the cached instance that can be used by a single consumer. The latter creates the TermsEnum of each segment
once and reuse them to create the segment's iterator. This prevents the creation of all TermsEnums each time we want to access
the value of a single segment, hence reducing the number of docvalues iterator to create to Nx2 (one iterator and one lookup per segment).
This commit is contained in:
Jim Ferenczi 2019-06-19 22:44:22 +02:00 committed by jimczi
parent 1f1a035def
commit c33d62adbc
5 changed files with 146 additions and 48 deletions

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.fielddata.ordinals;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
@ -34,12 +35,12 @@ final class GlobalOrdinalMapping extends SortedSetDocValues {
private final SortedSetDocValues values;
private final OrdinalMap ordinalMap;
private final LongValues mapping;
private final SortedSetDocValues[] bytesValues;
private final TermsEnum[] lookups;
GlobalOrdinalMapping(OrdinalMap ordinalMap, SortedSetDocValues[] bytesValues, int segmentIndex) {
GlobalOrdinalMapping(OrdinalMap ordinalMap, SortedSetDocValues values, TermsEnum[] lookups, int segmentIndex) {
super();
this.values = bytesValues[segmentIndex];
this.bytesValues = bytesValues;
this.values = values;
this.lookups = lookups;
this.ordinalMap = ordinalMap;
this.mapping = ordinalMap.getGlobalOrds(segmentIndex);
}
@ -72,7 +73,8 @@ final class GlobalOrdinalMapping extends SortedSetDocValues {
public BytesRef lookupOrd(long globalOrd) throws IOException {
final long segmentOrd = ordinalMap.getFirstSegmentOrd(globalOrd);
int readerIndex = ordinalMap.getFirstSegmentNumber(globalOrd);
return bytesValues[readerIndex].lookupOrd(segmentOrd);
lookups[readerIndex].seekExact(segmentOrd);
return lookups[readerIndex].term();
}
@Override

View File

@ -22,53 +22,64 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Accountable;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.index.AbstractIndexComponent;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.fielddata.AtomicOrdinalsFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.plain.AbstractAtomicOrdinalsFieldData;
import org.elasticsearch.search.MultiValueMode;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.function.Function;
/**
* {@link IndexFieldData} base class for concrete global ordinals implementations.
* Concrete implementation of {@link IndexOrdinalsFieldData} for global ordinals.
* A single instance of this class should be used to cache global ordinals per {@link DirectoryReader}.
* However {@link #loadGlobal(DirectoryReader)} always creates a new instance of {@link Consumer} from the cached
* value in order to reuse the segment's {@link TermsEnum} that are needed to retrieve terms from global ordinals.
* Each instance of {@link Consumer} uses a new set of {@link TermsEnum} that can be reused during the collection,
* this is done to avoid creating all segment's {@link TermsEnum} each time we want to access the values of a single
* segment.
*/
public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent implements IndexOrdinalsFieldData, Accountable {
public final class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent implements IndexOrdinalsFieldData, Accountable {
private final String fieldName;
private final long memorySizeInBytes;
private final OrdinalMap ordinalMap;
private final Atomic[] atomicReaders;
private final AtomicOrdinalsFieldData[] segmentAfd;
private final Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction;
protected GlobalOrdinalsIndexFieldData(IndexSettings indexSettings, String fieldName, AtomicOrdinalsFieldData[] segmentAfd,
OrdinalMap ordinalMap, long memorySizeInBytes, Function<SortedSetDocValues,
ScriptDocValues<?>> scriptFunction) {
protected GlobalOrdinalsIndexFieldData(IndexSettings indexSettings,
String fieldName,
AtomicOrdinalsFieldData[] segmentAfd,
OrdinalMap ordinalMap,
long memorySizeInBytes,
Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction) {
super(indexSettings);
this.fieldName = fieldName;
this.memorySizeInBytes = memorySizeInBytes;
this.ordinalMap = ordinalMap;
this.atomicReaders = new Atomic[segmentAfd.length];
for (int i = 0; i < segmentAfd.length; i++) {
atomicReaders[i] = new Atomic(segmentAfd[i], ordinalMap, i);
}
this.segmentAfd = segmentAfd;
this.scriptFunction = scriptFunction;
}
public IndexOrdinalsFieldData newConsumer(DirectoryReader source) {
return new Consumer(source, indexSettings);
}
@Override
public AtomicOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception {
return load(context);
throw new IllegalStateException("loadDirect(LeafReaderContext) should not be called in this context");
}
@Override
@ -92,9 +103,7 @@ public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent impleme
}
@Override
public void clear() {
// no need to clear, because this is cached and cleared in AbstractBytesIndexFieldData
}
public void clear() {}
@Override
public long ramBytesUsed() {
@ -109,7 +118,7 @@ public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent impleme
@Override
public AtomicOrdinalsFieldData load(LeafReaderContext context) {
return atomicReaders[context.ord];
throw new IllegalStateException("load(LeafReaderContext) should not be called in this context");
}
@Override
@ -117,46 +126,108 @@ public class GlobalOrdinalsIndexFieldData extends AbstractIndexComponent impleme
return ordinalMap;
}
private final class Atomic extends AbstractAtomicOrdinalsFieldData {
/**
* A non-thread safe {@link IndexOrdinalsFieldData} for global ordinals that creates the {@link TermsEnum} of each
* segment once and use them to provide a single lookup per segment.
*/
public class Consumer extends AbstractIndexComponent implements IndexOrdinalsFieldData, Accountable {
private final DirectoryReader source;
private TermsEnum[] lookups;
private final AtomicOrdinalsFieldData afd;
private final OrdinalMap ordinalMap;
private final int segmentIndex;
Consumer(DirectoryReader source, IndexSettings settings) {
super(settings);
this.source = source;
}
private Atomic(AtomicOrdinalsFieldData afd, OrdinalMap ordinalMap, int segmentIndex) {
super(scriptFunction);
this.afd = afd;
this.ordinalMap = ordinalMap;
this.segmentIndex = segmentIndex;
/**
* Lazy creation of the {@link TermsEnum} for each segment present in this reader
*/
private TermsEnum[] getOrLoadTermsEnums() {
if (lookups == null) {
lookups = new TermsEnum[segmentAfd.length];
for (int i = 0; i < lookups.length; i++) {
try {
lookups[i] = segmentAfd[i].getOrdinalsValues().termsEnum();
} catch (IOException e) {
throw new UncheckedIOException("Failed to load terms enum", e);
}
}
}
return lookups;
}
@Override
public SortedSetDocValues getOrdinalsValues() {
final SortedSetDocValues values = afd.getOrdinalsValues();
if (values.getValueCount() == ordinalMap.getValueCount()) {
// segment ordinals match global ordinals
return values;
}
final SortedSetDocValues[] bytesValues = new SortedSetDocValues[atomicReaders.length];
for (int i = 0; i < bytesValues.length; i++) {
bytesValues[i] = atomicReaders[i].afd.getOrdinalsValues();
}
return new GlobalOrdinalMapping(ordinalMap, bytesValues, segmentIndex);
public AtomicOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception {
return load(context);
}
@Override
public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
return this;
}
@Override
public IndexOrdinalsFieldData localGlobalDirect(DirectoryReader indexReader) throws Exception {
return this;
}
@Override
public String getFieldName() {
return fieldName;
}
@Override
public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) {
throw new UnsupportedOperationException("no global ordinals sorting yet");
}
@Override
public void clear() {}
@Override
public long ramBytesUsed() {
return afd.ramBytesUsed();
return memorySizeInBytes;
}
@Override
public Collection<Accountable> getChildResources() {
return afd.getChildResources();
return Collections.emptyList();
}
@Override
public void close() {
public AtomicOrdinalsFieldData load(LeafReaderContext context) {
assert source.getReaderCacheHelper().getKey() == context.parent.reader().getReaderCacheHelper().getKey();
return new AbstractAtomicOrdinalsFieldData(scriptFunction) {
@Override
public SortedSetDocValues getOrdinalsValues() {
final SortedSetDocValues values = segmentAfd[context.ord].getOrdinalsValues();
if (values.getValueCount() == ordinalMap.getValueCount()) {
// segment ordinals match global ordinals
return values;
}
final TermsEnum[] atomicLookups = getOrLoadTermsEnums();
return new GlobalOrdinalMapping(ordinalMap, values, atomicLookups, context.ord);
}
@Override
public long ramBytesUsed() {
return segmentAfd[context.ord].ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return segmentAfd[context.ord].getChildResources();
}
@Override
public void close() {}
};
}
@Override
public OrdinalMap getOrdinalMap() {
return ordinalMap;
}
}
}

View File

@ -32,6 +32,7 @@ import org.elasticsearch.index.fielddata.AtomicOrdinalsFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import java.io.IOException;
@ -60,6 +61,17 @@ public abstract class AbstractIndexOrdinalsFieldData extends AbstractIndexFieldD
@Override
public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
IndexOrdinalsFieldData fieldData = loadGlobalInternal(indexReader);
if (fieldData instanceof GlobalOrdinalsIndexFieldData) {
// we create a new instance of the cached value for each consumer in order
// to avoid creating new TermsEnums for each segment in the cached instance
return ((GlobalOrdinalsIndexFieldData) fieldData).newConsumer(indexReader);
} else {
return fieldData;
}
}
private IndexOrdinalsFieldData loadGlobalInternal(DirectoryReader indexReader) {
if (indexReader.leaves().size() <= 1) {
// ordinals are already global
return this;

View File

@ -37,6 +37,7 @@ import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.MultiValueMode;
@ -92,6 +93,17 @@ public class SortedSetDVOrdinalsIndexFieldData extends DocValuesIndexFieldData i
@Override
public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
IndexOrdinalsFieldData fieldData = loadGlobalInternal(indexReader);
if (fieldData instanceof GlobalOrdinalsIndexFieldData) {
// we create a new instance of the cached value for each consumer in order
// to avoid creating new TermsEnums for each segment in the cached instance
return ((GlobalOrdinalsIndexFieldData) fieldData).newConsumer(indexReader);
} else {
return fieldData;
}
}
private IndexOrdinalsFieldData loadGlobalInternal(DirectoryReader indexReader) {
if (indexReader.leaves().size() <= 1) {
// ordinals are already global
return this;

View File

@ -465,7 +465,7 @@ public abstract class AbstractStringFieldDataTestCase extends AbstractFieldDataI
assertThat(topLevelReader.leaves().size(), equalTo(3));
// First segment
assertThat(globalOrdinals, instanceOf(GlobalOrdinalsIndexFieldData.class));
assertThat(globalOrdinals, instanceOf(GlobalOrdinalsIndexFieldData.Consumer.class));
LeafReaderContext leaf = topLevelReader.leaves().get(0);
AtomicOrdinalsFieldData afd = globalOrdinals.load(leaf);
SortedSetDocValues values = afd.getOrdinalsValues();
@ -590,7 +590,7 @@ public abstract class AbstractStringFieldDataTestCase extends AbstractFieldDataI
IndexOrdinalsFieldData ifd = getForField("string", "value", hasDocValues());
IndexOrdinalsFieldData globalOrdinals = ifd.loadGlobal(topLevelReader);
assertNotNull(globalOrdinals.getOrdinalMap());
assertThat(ifd.loadGlobal(topLevelReader), sameInstance(globalOrdinals));
assertThat(ifd.loadGlobal(topLevelReader).getOrdinalMap(), sameInstance(globalOrdinals.getOrdinalMap()));
// 3 b/c 1 segment level caches and 1 top level cache
// in case of doc values, we don't cache atomic FD, so only the top-level cache is there
assertThat(indicesFieldDataCache.getCache().weight(), equalTo(hasDocValues() ? 1L : 4L));
@ -602,7 +602,8 @@ public abstract class AbstractStringFieldDataTestCase extends AbstractFieldDataI
break;
}
}
assertThat(cachedInstance, sameInstance(globalOrdinals));
assertNotSame(cachedInstance, globalOrdinals);
assertThat(cachedInstance.getOrdinalMap(), sameInstance(globalOrdinals.getOrdinalMap()));
topLevelReader.close();
// Now only 3 segment level entries, only the toplevel reader has been closed, but the segment readers are still used by IW
assertThat(indicesFieldDataCache.getCache().weight(), equalTo(hasDocValues() ? 0L : 3L));