Simplify class hierarchy for ordinals field data. (#60606)

This PR simplifies the hierarchy for ordinals field data classes:
* Remove `AbstractIndexFieldData`, since only `AbstractIndexOrdinalsFieldData`
inherits directly from it.
* Make `SortedSetOrdinalsIndexFieldData` extend
`AbstractIndexOrdinalsFieldData`. This lets us remove some redundant code.
This commit is contained in:
Julie Tibshirani 2020-08-03 09:58:29 -07:00 committed by GitHub
parent c5f4f91ac4
commit fc63f8224f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 138 additions and 277 deletions

View File

@ -22,7 +22,7 @@ import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.breaker.CircuitBreaker;
import org.elasticsearch.index.fielddata.plain.AbstractIndexFieldData;
import org.elasticsearch.index.fielddata.plain.AbstractIndexOrdinalsFieldData;
import java.io.IOException;
@ -38,13 +38,14 @@ public final class RamAccountingTermsEnum extends FilteredTermsEnum {
private final CircuitBreaker breaker;
private final TermsEnum termsEnum;
private final AbstractIndexFieldData.PerValueEstimator estimator;
private final AbstractIndexOrdinalsFieldData.PerValueEstimator estimator;
private final String fieldName;
private long totalBytes;
private long flushBuffer;
public RamAccountingTermsEnum(TermsEnum termsEnum, CircuitBreaker breaker, AbstractIndexFieldData.PerValueEstimator estimator,
public RamAccountingTermsEnum(TermsEnum termsEnum, CircuitBreaker breaker,
AbstractIndexOrdinalsFieldData.PerValueEstimator estimator,
String fieldName) {
super(termsEnum);
this.breaker = breaker;

View File

@ -1,125 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.LeafFieldData;
import org.elasticsearch.index.fielddata.RamAccountingTermsEnum;
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import java.io.IOException;
/**
 * Base class for {@link IndexFieldData} implementations whose per-segment data is
 * obtained through an {@link IndexFieldDataCache}.
 *
 * @param <FD> the per-segment (leaf) field data type returned by {@link #load(LeafReaderContext)}
 */
public abstract class AbstractIndexFieldData<FD extends LeafFieldData> implements IndexFieldData<FD> {

    private final String fieldName;
    // Assigned exactly once in the constructor, so it is declared final like the other fields.
    private final ValuesSourceType valuesSourceType;
    protected final IndexFieldDataCache cache;

    /**
     * @param fieldName        name of the field; returned by {@link #getFieldName()} and used by
     *                         {@link #load(LeafReaderContext)} to look the field up in a segment
     * @param valuesSourceType value returned by {@link #getValuesSourceType()}
     * @param cache            cache that {@link #load(LeafReaderContext)} delegates to
     */
    public AbstractIndexFieldData(
        String fieldName,
        ValuesSourceType valuesSourceType,
        IndexFieldDataCache cache
    ) {
        this.fieldName = fieldName;
        this.valuesSourceType = valuesSourceType;
        this.cache = cache;
    }

    @Override
    public String getFieldName() {
        return this.fieldName;
    }

    @Override
    public ValuesSourceType getValuesSourceType() {
        return valuesSourceType;
    }

    /**
     * Loads the field data for one segment, going through {@link #cache} when the segment
     * reports the field.
     *
     * @param context the segment to load field data for
     * @return the cached field data, or {@link #empty(int)} when the segment has no field info
     *         for this field
     * @throws ElasticsearchException if the cache fails to load; exceptions of other types are
     *                                wrapped, an {@code ElasticsearchException} is rethrown as is
     */
    @Override
    public FD load(LeafReaderContext context) {
        if (context.reader().getFieldInfos().fieldInfo(fieldName) == null) {
            // Some leaf readers may be wrapped and report different set of fields and use the same cache key.
            // If a field can't be found then it doesn't mean it isn't there,
            // so if a field doesn't exist then we don't cache it and just return an empty field data instance.
            // The next time the field is found, we do cache.
            return empty(context.reader().maxDoc());
        }
        try {
            return cache.load(context, this);
        } catch (ElasticsearchException e) {
            // Already the exception type callers expect: propagate without wrapping.
            throw e;
        } catch (Exception e) {
            throw new ElasticsearchException(e);
        }
    }

    /**
     * @param maxDoc of the current reader
     * @return an empty field data instance for field data lookups of empty segments (returning no values)
     */
    protected abstract FD empty(int maxDoc);

    /**
     * A {@code PerValueEstimator} is a sub-class that can be used to estimate
     * the memory overhead for loading the data. Each field data
     * implementation should implement its own {@code PerValueEstimator} if it
     * intends to take advantage of the CircuitBreaker.
     * <p>
     * Note that the .beforeLoad(...) and .afterLoad(...) methods must be
     * manually called.
     */
    public interface PerValueEstimator {

        /**
         * @return the number of bytes for the given term
         */
        long bytesPerValue(BytesRef term);

        /**
         * Execute any pre-loading estimations for the terms. May also
         * optionally wrap a {@link TermsEnum} in a
         * {@link RamAccountingTermsEnum}
         * which will estimate the memory on a per-term basis.
         *
         * @param terms terms to be estimated
         * @return A TermsEnum for the given terms
         */
        TermsEnum beforeLoad(Terms terms) throws IOException;

        /**
         * Possibly adjust a circuit breaker after field data has been loaded,
         * now that the actual amount of memory used by the field data is known
         *
         * @param termsEnum  terms that were loaded
         * @param actualUsed actual field data memory usage
         */
        void afterLoad(TermsEnum termsEnum, long actualUsed);
    }
}

View File

@ -21,10 +21,9 @@ package org.elasticsearch.index.fielddata.plain;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
@ -32,35 +31,47 @@ import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.LeafOrdinalsFieldData;
import org.elasticsearch.index.fielddata.RamAccountingTermsEnum;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import java.io.IOException;
import java.util.function.Function;
public abstract class AbstractIndexOrdinalsFieldData extends AbstractIndexFieldData<LeafOrdinalsFieldData>
implements IndexOrdinalsFieldData {
public abstract class AbstractIndexOrdinalsFieldData implements IndexOrdinalsFieldData {
// Logger for this class. It was previously created for AbstractBinaryDVLeafFieldData, which
// filed this class's messages under an unrelated logger name.
private static final Logger logger = LogManager.getLogger(AbstractIndexOrdinalsFieldData.class);
private final double minFrequency, maxFrequency;
private final int minSegmentSize;
private final String fieldName;
private final ValuesSourceType valuesSourceType;
private final IndexFieldDataCache cache;
protected final CircuitBreakerService breakerService;
protected final Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction;
protected AbstractIndexOrdinalsFieldData(
String fieldName,
ValuesSourceType valuesSourceType,
IndexFieldDataCache cache,
CircuitBreakerService breakerService,
double minFrequency,
double maxFrequency,
int minSegmentSize
Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction
) {
super(fieldName, valuesSourceType, cache);
this.fieldName = fieldName;
this.valuesSourceType = valuesSourceType;
this.cache = cache;
this.breakerService = breakerService;
this.minFrequency = minFrequency;
this.maxFrequency = maxFrequency;
this.minSegmentSize = minSegmentSize;
this.scriptFunction = scriptFunction;
}
/** Returns the field name stored in {@code fieldName}. */
@Override
public String getFieldName() {
    return fieldName;
}
/** Returns the values source type stored in {@code valuesSourceType}. */
@Override
public ValuesSourceType getValuesSourceType() {
    return this.valuesSourceType;
}
@Override
@ -68,6 +79,27 @@ public abstract class AbstractIndexOrdinalsFieldData extends AbstractIndexFieldD
return null;
}
/**
 * Loads the ordinals field data for one segment, going through {@code cache} when the
 * segment reports the field.
 *
 * @param context the segment to load field data for
 * @return the cached field data, or {@code AbstractLeafOrdinalsFieldData.empty()} when the
 *         segment has no field info for this field
 * @throws ElasticsearchException if the cache fails to load; other exception types are wrapped
 */
@Override
public LeafOrdinalsFieldData load(LeafReaderContext context) {
    final boolean fieldPresent = context.reader().getFieldInfos().fieldInfo(fieldName) != null;
    if (fieldPresent == false) {
        // Some leaf readers may be wrapped and report different set of fields and use the same cache key.
        // If a field can't be found then it doesn't mean it isn't there,
        // so if a field doesn't exist then we don't cache it and just return an empty field data instance.
        // The next time the field is found, we do cache.
        return AbstractLeafOrdinalsFieldData.empty();
    }
    try {
        return cache.load(context, this);
    } catch (ElasticsearchException e) {
        // Already the expected exception type: propagate without wrapping.
        throw e;
    } catch (Exception e) {
        throw new ElasticsearchException(e);
    }
}
@Override
public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
IndexOrdinalsFieldData fieldData = loadGlobalInternal(indexReader);
@ -121,60 +153,49 @@ public abstract class AbstractIndexOrdinalsFieldData extends AbstractIndexFieldD
this,
breakerService,
logger,
AbstractLeafOrdinalsFieldData.DEFAULT_SCRIPT_FUNCTION
scriptFunction
);
}
/**
 * Returns the result of {@code AbstractLeafOrdinalsFieldData.empty()}.
 * The {@code maxDoc} argument is not used by this implementation.
 */
@Override
protected LeafOrdinalsFieldData empty(int maxDoc) {
return AbstractLeafOrdinalsFieldData.empty();
}
/**
 * Optionally wraps {@code iterator} in a {@code FrequencyFilter} built from the configured
 * {@code minFrequency}, {@code maxFrequency} and {@code minSegmentSize}.
 * <p>
 * A frequency setting greater than {@code 1.0} is truncated to an absolute document count;
 * otherwise it is multiplied by the segment's document count. No wrapping happens when the
 * document count is below {@code minSegmentSize} or when the resulting bounds would not
 * exclude anything.
 *
 * @param terms    terms whose {@code getDocCount()} supplies the document count
 * @param iterator the enum to filter; {@code null} is returned unchanged
 * @param reader   supplies {@code maxDoc()} when {@code getDocCount()} reports {@code -1}
 * @return {@code iterator} itself or a filtering wrapper around it
 */
protected TermsEnum filter(Terms terms, TermsEnum iterator, LeafReader reader) throws IOException {
    if (iterator == null) {
        return null;
    }
    final int reportedDocCount = terms.getDocCount();
    final int docCount = reportedDocCount == -1 ? reader.maxDoc() : reportedDocCount;
    if (docCount < minSegmentSize) {
        return iterator;
    }

    final int minFreq;
    if (minFrequency > 1.0) {
        minFreq = (int) minFrequency;
    } else {
        minFreq = (int) (docCount * minFrequency);
    }
    final int maxFreq;
    if (maxFrequency > 1.0) {
        maxFreq = (int) maxFrequency;
    } else {
        maxFreq = (int) (docCount * maxFrequency);
    }

    final boolean boundsExcludeSomething = minFreq > 1 || maxFreq < docCount;
    return boundsExcludeSomething ? new FrequencyFilter(iterator, minFreq, maxFreq) : iterator;
}
/** This implementation always returns {@code false}. */
@Override
public boolean supportsGlobalOrdinalsMapping() {
return false;
}
private static final class FrequencyFilter extends FilteredTermsEnum {
/**
* A {@code PerValueEstimator} is a sub-class that can be used to estimate
* the memory overhead for loading the data. Each field data
* implementation should implement its own {@code PerValueEstimator} if it
* intends to take advantage of the CircuitBreaker.
* <p>
* Note that the .beforeLoad(...) and .afterLoad(...) methods must be
* manually called.
*/
public interface PerValueEstimator {
private int minFreq;
private int maxFreq;
FrequencyFilter(TermsEnum delegate, int minFreq, int maxFreq) {
super(delegate, false);
this.minFreq = minFreq;
this.maxFreq = maxFreq;
}
/**
* @return the number of bytes for the given term
*/
long bytesPerValue(BytesRef term);
@Override
protected AcceptStatus accept(BytesRef arg0) throws IOException {
int docFreq = docFreq();
if (docFreq >= minFreq && docFreq <= maxFreq) {
return AcceptStatus.YES;
}
return AcceptStatus.NO;
}
/**
* Execute any pre-loading estimations for the terms. May also
* optionally wrap a {@link TermsEnum} in a
* {@link RamAccountingTermsEnum}
* which will estimate the memory on a per-term basis.
*
* @param terms terms to be estimated
* @return A TermsEnum for the given terms
*/
TermsEnum beforeLoad(Terms terms) throws IOException;
/**
* Possibly adjust a circuit breaker after field data has been loaded,
* now that the actual amount of memory used by the field data is known
*
* @param termsEnum terms that were loaded
* @param actualUsed actual field data memory usage
*/
void afterLoad(TermsEnum termsEnum, long actualUsed);
}
}

View File

@ -37,7 +37,6 @@ import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.LeafOrdinalsFieldData;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.MultiValueMode;
@ -139,10 +138,7 @@ public class ConstantIndexFieldData extends AbstractIndexOrdinalsFieldData {
private final ConstantLeafFieldData atomicFieldData;
private ConstantIndexFieldData(String name, String value, ValuesSourceType valuesSourceType) {
super(name, valuesSourceType, null, null,
TextFieldMapper.Defaults.FIELDDATA_MIN_FREQUENCY,
TextFieldMapper.Defaults.FIELDDATA_MAX_FREQUENCY,
TextFieldMapper.Defaults.FIELDDATA_MIN_SEGMENT_SIZE);
super(name, valuesSourceType, null, null, AbstractLeafOrdinalsFieldData.DEFAULT_SCRIPT_FUNCTION);
atomicFieldData = new ConstantLeafFieldData(value);
}

View File

@ -22,6 +22,7 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.codecs.blocktree.FieldReader;
import org.apache.lucene.codecs.blocktree.Stats;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
@ -58,6 +59,9 @@ import java.io.IOException;
public class PagedBytesIndexFieldData extends AbstractIndexOrdinalsFieldData {
private static final Logger logger = LogManager.getLogger(PagedBytesIndexFieldData.class);
private final double minFrequency, maxFrequency;
private final int minSegmentSize;
public static class Builder implements IndexFieldData.Builder {
private final String name;
private final double minFrequency, maxFrequency;
@ -88,7 +92,10 @@ public class PagedBytesIndexFieldData extends AbstractIndexOrdinalsFieldData {
double maxFrequency,
int minSegmentSize
) {
super(fieldName, valuesSourceType, cache, breakerService, minFrequency, maxFrequency, minSegmentSize);
super(fieldName, valuesSourceType, cache, breakerService, AbstractLeafOrdinalsFieldData.DEFAULT_SCRIPT_FUNCTION);
this.minFrequency = minFrequency;
this.maxFrequency = maxFrequency;
this.minSegmentSize = minSegmentSize;
}
@Override
@ -255,6 +262,28 @@ public class PagedBytesIndexFieldData extends AbstractIndexOrdinalsFieldData {
}
}
/**
 * Optionally wraps {@code iterator} in a {@link FrequencyFilter} built from this instance's
 * {@code minFrequency}, {@code maxFrequency} and {@code minSegmentSize}.
 * <p>
 * A frequency setting greater than {@code 1.0} is truncated to an absolute document count;
 * otherwise it is multiplied by the segment's document count. The enum is returned unwrapped
 * when the document count is below {@code minSegmentSize} or when the computed bounds would
 * not exclude anything.
 *
 * @param terms    terms whose {@code getDocCount()} supplies the document count
 * @param iterator the enum to filter; {@code null} is returned unchanged
 * @param reader   supplies {@code maxDoc()} when {@code getDocCount()} reports {@code -1}
 * @return {@code iterator} itself or a filtering wrapper around it
 */
private TermsEnum filter(Terms terms, TermsEnum iterator, LeafReader reader) throws IOException {
    if (iterator == null) {
        return null;
    }
    int segmentDocs = terms.getDocCount();
    if (segmentDocs == -1) {
        segmentDocs = reader.maxDoc();
    }
    if (segmentDocs < minSegmentSize) {
        // Too small a segment to apply frequency filtering to.
        return iterator;
    }

    final int lowerBound = minFrequency > 1.0 ? (int) minFrequency : (int) (segmentDocs * minFrequency);
    final int upperBound = maxFrequency > 1.0 ? (int) maxFrequency : (int) (segmentDocs * maxFrequency);
    if (lowerBound <= 1 && upperBound >= segmentDocs) {
        // The bounds would not exclude anything, so no wrapper is created.
        return iterator;
    }
    return new FrequencyFilter(iterator, lowerBound, upperBound);
}
/**
* Adjust the circuit breaker now that terms have been loaded, getting
* the actual used either from the parameter (if estimation worked for
@ -271,6 +300,25 @@ public class PagedBytesIndexFieldData extends AbstractIndexOrdinalsFieldData {
}
breaker.addWithoutBreaking(-(estimatedBytes - actualUsed));
}
}
/**
 * A {@link FilteredTermsEnum} that accepts a term only when the value returned by
 * {@code docFreq()} lies in the inclusive range {@code [minFreq, maxFreq]}.
 */
private static final class FrequencyFilter extends FilteredTermsEnum {

    private final int minFreq;
    private final int maxFreq;

    /**
     * @param delegate the enum whose terms are filtered
     * @param minFreq  lowest accepted document frequency (inclusive)
     * @param maxFreq  highest accepted document frequency (inclusive)
     */
    FrequencyFilter(TermsEnum delegate, int minFreq, int maxFreq) {
        super(delegate, false);
        this.minFreq = minFreq;
        this.maxFreq = maxFreq;
    }

    /** Decides on the current term using only its document frequency; the argument is not read. */
    @Override
    protected AcceptStatus accept(BytesRef arg0) throws IOException {
        final int frequency = docFreq();
        final boolean withinBounds = frequency >= minFreq && frequency <= maxFreq;
        return withinBounds ? AcceptStatus.YES : AcceptStatus.NO;
    }
}
}

View File

@ -19,27 +19,20 @@
package org.elasticsearch.index.fielddata.plain;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.SortedSetSortField;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.LeafOrdinalsFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsBuilder;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalsIndexFieldData;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.DocValueFormat;
@ -48,10 +41,9 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import org.elasticsearch.search.sort.BucketedSort;
import org.elasticsearch.search.sort.SortOrder;
import java.io.IOException;
import java.util.function.Function;
public class SortedSetOrdinalsIndexFieldData implements IndexOrdinalsFieldData {
public class SortedSetOrdinalsIndexFieldData extends AbstractIndexOrdinalsFieldData {
public static class Builder implements IndexFieldData.Builder {
private final String name;
@ -78,13 +70,6 @@ public class SortedSetOrdinalsIndexFieldData implements IndexOrdinalsFieldData {
}
}
protected final String fieldName;
private final IndexFieldDataCache cache;
private final CircuitBreakerService breakerService;
private final Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction;
private final ValuesSourceType valuesSourceType;
private static final Logger logger = LogManager.getLogger(SortedSetOrdinalsIndexFieldData.class);
public SortedSetOrdinalsIndexFieldData(
IndexFieldDataCache cache,
String fieldName,
@ -92,21 +77,7 @@ public class SortedSetOrdinalsIndexFieldData implements IndexOrdinalsFieldData {
CircuitBreakerService breakerService,
Function<SortedSetDocValues, ScriptDocValues<?>> scriptFunction
) {
this.fieldName = fieldName;
this.valuesSourceType = valuesSourceType;
this.cache = cache;
this.breakerService = breakerService;
this.scriptFunction = scriptFunction;
}
@Override
public final String getFieldName() {
return fieldName;
}
@Override
public ValuesSourceType getValuesSourceType() {
return valuesSourceType;
super(fieldName, valuesSourceType, cache, breakerService, scriptFunction);
}
@Override
@ -121,7 +92,7 @@ public class SortedSetOrdinalsIndexFieldData implements IndexOrdinalsFieldData {
(source.sortMissingLast(missingValue) == false && source.sortMissingFirst(missingValue) == false)) {
return new SortField(getFieldName(), source, reverse);
}
SortField sortField = new SortedSetSortField(fieldName, reverse,
SortField sortField = new SortedSetSortField(getFieldName(), reverse,
sortMode == MultiValueMode.MAX ? SortedSetSelector.Type.MAX : SortedSetSelector.Type.MIN);
sortField.setMissingValue(source.sortMissingLast(missingValue) ^ reverse ?
SortedSetSortField.STRING_LAST : SortedSetSortField.STRING_FIRST);
@ -136,65 +107,14 @@ public class SortedSetOrdinalsIndexFieldData implements IndexOrdinalsFieldData {
@Override
public LeafOrdinalsFieldData load(LeafReaderContext context) {
return new SortedSetBytesLeafFieldData(context.reader(), fieldName, scriptFunction);
return new SortedSetBytesLeafFieldData(context.reader(), getFieldName(), scriptFunction);
}
@Override
public LeafOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception {
public LeafOrdinalsFieldData loadDirect(LeafReaderContext context) {
return load(context);
}
/**
 * Loads global ordinals via {@code loadGlobalInternal} and, when the result is a
 * {@link GlobalOrdinalsIndexFieldData}, returns a fresh consumer of it for {@code indexReader}.
 *
 * @param indexReader the reader to load global ordinals for
 * @return a per-consumer view of the global ordinals, or the loaded field data unchanged when
 *         it is of any other type
 */
@Override
public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
    final IndexOrdinalsFieldData loaded = loadGlobalInternal(indexReader);
    if (loaded instanceof GlobalOrdinalsIndexFieldData == false) {
        return loaded;
    }
    // we create a new instance of the cached value for each consumer in order
    // to avoid creating new TermsEnums for each segment in the cached instance
    return ((GlobalOrdinalsIndexFieldData) loaded).newConsumer(indexReader);
}
/**
 * Resolves the global ordinals field data for {@code indexReader}.
 * <ul>
 * <li>With at most one leaf, this instance is returned.</li>
 * <li>When no leaf reports the field, an uncached empty instance is built.</li>
 * <li>Otherwise the result comes from {@code cache}.</li>
 * </ul>
 *
 * @throws RuntimeException       wrapping an {@link IOException} raised while building the empty instance
 * @throws ElasticsearchException if the cache load fails; other exception types are wrapped
 */
private IndexOrdinalsFieldData loadGlobalInternal(DirectoryReader indexReader) {
    if (indexReader.leaves().size() <= 1) {
        // ordinals are already global
        return this;
    }

    final boolean fieldFound = indexReader.leaves()
        .stream()
        .anyMatch(leaf -> leaf.reader().getFieldInfos().fieldInfo(getFieldName()) != null);

    if (fieldFound) {
        try {
            return cache.load(indexReader, this);
        } catch (ElasticsearchException e) {
            // Already the expected exception type: propagate without wrapping.
            throw e;
        } catch (Exception e) {
            throw new ElasticsearchException(e);
        }
    }

    // Some directory readers may be wrapped and report different set of fields and use the same cache key.
    // If a field can't be found then it doesn't mean it isn't there,
    // so if a field doesn't exist then we don't cache it and just return an empty field data instance.
    // The next time the field is found, we do cache.
    try {
        return GlobalOrdinalsBuilder.buildEmpty(indexReader, this);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Builds global ordinals for {@code indexReader} by calling {@code GlobalOrdinalsBuilder.build}
 * with this instance, {@code breakerService}, {@code logger} and {@code scriptFunction}.
 * This method does not read from or write to {@code cache}.
 */
@Override
public IndexOrdinalsFieldData loadGlobalDirect(DirectoryReader indexReader) throws Exception {
return GlobalOrdinalsBuilder.build(indexReader, this, breakerService, logger, scriptFunction);
}
@Override
public OrdinalMap getOrdinalMap() {
return null;