Fix illegal cast of the "low cardinality" optimization of the `terms` aggregation. (#27543)

The GlobalOrdinalsStringTermsAggregator.LowCardinality aggregator casts global
values to `GlobalOrdinalMapping`, even though the implementation of global
values is different when a `missing` value is configured.

This commit adds a new API that gives access to the ordinal remapping in order
to fix this problem.
This commit is contained in:
Adrien Grand 2017-11-28 14:55:09 +01:00 committed by GitHub
parent 996990ad1f
commit d01fcee645
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 229 additions and 70 deletions

View File

@ -29,7 +29,7 @@ import java.io.IOException;
/**
* A {@link SortedSetDocValues} implementation that returns ordinals that are global.
*/
public class GlobalOrdinalMapping extends SortedSetDocValues {
final class GlobalOrdinalMapping extends SortedSetDocValues {
private final SortedSetDocValues values;
private final OrdinalMap ordinalMap;
@ -49,7 +49,7 @@ public class GlobalOrdinalMapping extends SortedSetDocValues {
return ordinalMap.getValueCount();
}
public final long getGlobalOrd(long segmentOrd) {
public long getGlobalOrd(long segmentOrd) {
return mapping.get(segmentOrd);
}

View File

@ -25,7 +25,6 @@ import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalMapping;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.sort.SortOrder;
@ -179,16 +178,10 @@ abstract class CompositeValuesSource<VS extends ValuesSource, T extends Comparab
if (lookup == null) {
lookup = dvs;
if (topValue != null && topValueLong == null) {
if (lookup instanceof GlobalOrdinalMapping) {
// Find the global ordinal (or the insertion point) for the provided top value.
topValueLong = lookupGlobalOrdinals((GlobalOrdinalMapping) lookup, topValue);
} else {
// Global ordinals are not needed, switch back to ordinals (single segment case).
topValueLong = lookup.lookupTerm(topValue);
if (topValueLong < 0) {
// convert negative insert position
topValueLong = -topValueLong - 2;
}
topValueLong = lookup.lookupTerm(topValue);
if (topValueLong < 0) {
// convert negative insert position
topValueLong = -topValueLong - 2;
}
}
}
@ -202,25 +195,6 @@ abstract class CompositeValuesSource<VS extends ValuesSource, T extends Comparab
}
};
}
private static long lookupGlobalOrdinals(GlobalOrdinalMapping mapping, BytesRef key) throws IOException {
long low = 0;
long high = mapping.getValueCount();
while (low <= high) {
long mid = (low + high) >>> 1;
BytesRef midVal = mapping.lookupOrd(mid);
int cmp = midVal.compareTo(key);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid;
}
}
return low-1;
}
}
/**

View File

@ -33,7 +33,6 @@ import org.elasticsearch.common.util.IntArray;
import org.elasticsearch.common.util.LongHash;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.fielddata.AbstractSortedSetDocValues;
import org.elasticsearch.index.fielddata.ordinals.GlobalOrdinalMapping;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
@ -50,6 +49,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.LongUnaryOperator;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
@ -295,9 +295,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
*/
static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
private LongUnaryOperator mapping;
private IntArray segmentDocCounts;
private SortedSetDocValues globalOrds;
private SortedSetDocValues segmentOrds;
LowCardinality(String name,
AggregatorFactories factories,
@ -321,14 +320,14 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
final LeafBucketCollector sub) throws IOException {
if (segmentOrds != null) {
mapSegmentCountsToGlobalCounts();
if (mapping != null) {
mapSegmentCountsToGlobalCounts(mapping);
}
globalOrds = valuesSource.globalOrdinalsValues(ctx);
segmentOrds = valuesSource.ordinalsValues(ctx);
final SortedSetDocValues segmentOrds = valuesSource.ordinalsValues(ctx);
segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount());
assert sub == LeafBucketCollector.NO_OP_COLLECTOR;
final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds);
mapping = valuesSource.globalOrdinalsMapping(ctx);
if (singleValues != null) {
return new LeafBucketCollectorBase(sub, segmentOrds) {
@Override
@ -356,9 +355,10 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
}
@Override
protected void doPostCollection() {
if (segmentOrds != null) {
mapSegmentCountsToGlobalCounts();
protected void doPostCollection() throws IOException {
if (mapping != null) {
mapSegmentCountsToGlobalCounts(mapping);
mapping = null;
}
}
@ -367,16 +367,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
Releasables.close(segmentDocCounts);
}
private void mapSegmentCountsToGlobalCounts() {
// There is no public method in Ordinals.Docs that allows for this mapping...
// This is the cleanest way I can think of so far
GlobalOrdinalMapping mapping;
if (globalOrds.getValueCount() == segmentOrds.getValueCount()) {
mapping = null;
} else {
mapping = (GlobalOrdinalMapping) globalOrds;
}
private void mapSegmentCountsToGlobalCounts(LongUnaryOperator mapping) throws IOException {
for (long i = 1; i < segmentDocCounts.size(); i++) {
// We use set(...) here, because we need to reset the slow to 0.
// segmentDocCounts get reused over the segments and otherwise counts would be too high.
@ -385,7 +376,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
continue;
}
final long ord = i - 1; // remember we do +1 when counting
final long globalOrd = mapping == null ? ord : mapping.getGlobalOrd(ord);
final long globalOrd = mapping.applyAsLong(ord);
long bucketOrd = getBucketOrd(globalOrd);
incrementBucketDocCount(bucketOrd, inc);
}

View File

@ -49,6 +49,8 @@ import java.util.Map;
public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource, TermsAggregatorFactory> {
private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(TermsAggregatorFactory.class));
static Boolean REMAP_GLOBAL_ORDS, COLLECT_SEGMENT_ORDS;
private final BucketOrder order;
private final IncludeExclude includeExclude;
private final String executionHint;
@ -257,11 +259,13 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
final long maxOrd = getMaxOrd(valuesSource, context.searcher());
assert maxOrd != -1;
final double ratio = maxOrd / ((double) context.searcher().getIndexReader().numDocs());
final double ratio = maxOrd / ((double) context.searcher().getIndexReader().numDocs());
if (factories == AggregatorFactories.EMPTY &&
includeExclude == null &&
Aggregator.descendsFromBucketAggregator(parent) == false &&
ratio <= 0.5 && maxOrd <= 2048) {
// we use the static COLLECT_SEGMENT_ORDS to allow tests to force specific optimizations
(COLLECT_SEGMENT_ORDS!= null ? COLLECT_SEGMENT_ORDS.booleanValue() : ratio <= 0.5 && maxOrd <= 2048)) {
/**
* We can use the low cardinality execution mode iff this aggregator:
* - has no sub-aggregator AND
@ -276,18 +280,24 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
}
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(format);
boolean remapGlobalOrds = true;
if (includeExclude == null &&
Aggregator.descendsFromBucketAggregator(parent) == false &&
(factories == AggregatorFactories.EMPTY ||
(isAggregationSort(order) == false && subAggCollectMode == SubAggCollectionMode.BREADTH_FIRST))) {
/**
* We don't need to remap global ords iff this aggregator:
* - has no include/exclude rules AND
* - is not a child of a bucket aggregator AND
* - has no sub-aggregator or only sub-aggregator that can be deferred ({@link SubAggCollectionMode#BREADTH_FIRST}).
**/
remapGlobalOrds = false;
boolean remapGlobalOrds;
if (REMAP_GLOBAL_ORDS != null) {
// We use REMAP_GLOBAL_ORDS to allow tests to force specific optimizations
remapGlobalOrds = REMAP_GLOBAL_ORDS.booleanValue();
} else {
remapGlobalOrds = true;
if (includeExclude == null &&
Aggregator.descendsFromBucketAggregator(parent) == false &&
(factories == AggregatorFactories.EMPTY ||
(isAggregationSort(order) == false && subAggCollectMode == SubAggCollectionMode.BREADTH_FIRST))) {
/**
* We don't need to remap global ords iff this aggregator:
* - has no include/exclude rules AND
* - is not a child of a bucket aggregator AND
* - has no sub-aggregator or only sub-aggregator that can be deferred ({@link SubAggCollectionMode#BREADTH_FIRST}).
**/
remapGlobalOrds = false;
}
}
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals) valuesSource, order,
format, bucketCountThresholds, filter, context, parent, remapGlobalOrds, subAggCollectMode, showTermDocCountError,

View File

@ -31,6 +31,7 @@ import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import java.io.IOException;
import java.util.function.LongUnaryOperator;
/**
* Utility class that allows to return views of {@link ValuesSource}s that
@ -201,6 +202,13 @@ public enum MissingValues {
SortedSetDocValues values = valuesSource.globalOrdinalsValues(context);
return replaceMissing(values, missing);
}
@Override
public LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context) throws IOException {
return getGlobalMapping(valuesSource.ordinalsValues(context),
valuesSource.globalOrdinalsValues(context),
valuesSource.globalOrdinalsMapping(context), missing);
}
};
}
@ -311,6 +319,43 @@ public enum MissingValues {
};
}
static LongUnaryOperator getGlobalMapping(SortedSetDocValues values, SortedSetDocValues globalValues,
LongUnaryOperator segmentToGlobalOrd, BytesRef missing) throws IOException {
final long missingGlobalOrd = globalValues.lookupTerm(missing);
final long missingSegmentOrd = values.lookupTerm(missing);
if (missingSegmentOrd >= 0) {
// the missing value exists in the segment, nothing to do
return segmentToGlobalOrd;
} else if (missingGlobalOrd >= 0) {
// the missing value exists in another segment, but not the current one
final long insertedSegmentOrd = -1L - missingSegmentOrd;
final long insertedGlobalOrd = missingGlobalOrd;
return segmentOrd -> {
if (insertedSegmentOrd == segmentOrd) {
return insertedGlobalOrd;
} else if (insertedSegmentOrd > segmentOrd) {
return segmentToGlobalOrd.applyAsLong(segmentOrd);
} else {
return segmentToGlobalOrd.applyAsLong(segmentOrd - 1);
}
};
} else {
// the missing value exists neither in this segment nor in another segment
final long insertedSegmentOrd = -1L - missingSegmentOrd;
final long insertedGlobalOrd = -1L - missingGlobalOrd;
return segmentOrd -> {
if (insertedSegmentOrd == segmentOrd) {
return insertedGlobalOrd;
} else if (insertedSegmentOrd > segmentOrd) {
return segmentToGlobalOrd.applyAsLong(segmentOrd);
} else {
return 1 + segmentToGlobalOrd.applyAsLong(segmentOrd - 1);
}
};
}
}
public static ValuesSource.GeoPoint replaceMissing(final ValuesSource.GeoPoint valuesSource, final GeoPoint missing) {
return new ValuesSource.GeoPoint() {

View File

@ -22,6 +22,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.IndexSearcher;
@ -47,6 +48,7 @@ import org.elasticsearch.search.aggregations.support.values.ScriptDoubleValues;
import org.elasticsearch.search.aggregations.support.values.ScriptLongValues;
import java.io.IOException;
import java.util.function.LongUnaryOperator;
public abstract class ValuesSource {
@ -90,6 +92,11 @@ public abstract class ValuesSource {
return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary();
}
@Override
public LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context) throws IOException {
return LongUnaryOperator.identity();
}
};
@Override
@ -105,6 +112,10 @@ public abstract class ValuesSource {
public abstract SortedSetDocValues globalOrdinalsValues(LeafReaderContext context)
throws IOException;
/** Returns a mapping from segment ordinals to global ordinals. */
public abstract LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context)
throws IOException;
public long globalMaxOrd(IndexSearcher indexSearcher) throws IOException {
IndexReader indexReader = indexSearcher.getIndexReader();
if (indexReader.leaves().isEmpty()) {
@ -142,6 +153,18 @@ public abstract class ValuesSource {
final AtomicOrdinalsFieldData atomicFieldData = global.load(context);
return atomicFieldData.getOrdinalsValues();
}
@Override
public LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context) throws IOException {
final IndexOrdinalsFieldData global = indexFieldData.loadGlobal((DirectoryReader)context.parent.reader());
final OrdinalMap map = global.getOrdinalMap();
if (map == null) {
// segments and global ordinals are the same
return LongUnaryOperator.identity();
}
final org.apache.lucene.util.LongValues segmentToGlobalOrd = map.getGlobalOrds(context.ord);
return segmentToGlobalOrd::get;
}
}
}

View File

@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket;
package org.elasticsearch.search.aggregations.bucket.terms;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.index.IndexRequestBuilder;
@ -34,9 +34,11 @@ import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.AggregationTestScriptsPlugin;
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
import org.elasticsearch.search.aggregations.BucketOrder;
import org.elasticsearch.search.aggregations.bucket.AbstractTermsTestCase;
import org.elasticsearch.search.aggregations.bucket.filter.Filter;
import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregatorFactory;
import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
import org.elasticsearch.search.aggregations.metrics.avg.Avg;
import org.elasticsearch.search.aggregations.metrics.stats.Stats;
@ -44,6 +46,8 @@ import org.elasticsearch.search.aggregations.metrics.stats.extended.ExtendedStat
import org.elasticsearch.search.aggregations.metrics.sum.Sum;
import org.elasticsearch.test.ESIntegTestCase;
import org.hamcrest.Matchers;
import org.junit.After;
import org.junit.Before;
import java.io.IOException;
import java.util.ArrayList;
@ -84,6 +88,18 @@ public class StringTermsIT extends AbstractTermsTestCase {
return Collections.singleton(CustomScriptPlugin.class);
}
@Before
public void randomizeOptimizations() {
TermsAggregatorFactory.COLLECT_SEGMENT_ORDS = false;randomBoolean();
TermsAggregatorFactory.REMAP_GLOBAL_ORDS = randomBoolean();
}
@After
public void resetOptimizations() {
TermsAggregatorFactory.COLLECT_SEGMENT_ORDS = null;
TermsAggregatorFactory.REMAP_GLOBAL_ORDS = null;
}
public static class CustomScriptPlugin extends AggregationTestScriptsPlugin {
@Override

View File

@ -38,6 +38,7 @@ import org.apache.lucene.util.NumericUtils;
import org.elasticsearch.common.network.InetAddresses;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.MockBigArrays;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.IpFieldMapper;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
@ -68,7 +69,27 @@ import java.util.function.Function;
import static org.hamcrest.Matchers.instanceOf;
public class TermsAggregatorTests extends AggregatorTestCase {
private boolean randomizeAggregatorImpl = true;
@Override
protected <A extends Aggregator> A createAggregator(AggregationBuilder aggregationBuilder,
IndexSearcher indexSearcher, IndexSettings indexSettings, MappedFieldType... fieldTypes) throws IOException {
try {
if (randomizeAggregatorImpl) {
TermsAggregatorFactory.COLLECT_SEGMENT_ORDS = randomBoolean();
TermsAggregatorFactory.REMAP_GLOBAL_ORDS = randomBoolean();
}
return super.createAggregator(aggregationBuilder, indexSearcher, indexSettings, fieldTypes);
} finally {
TermsAggregatorFactory.COLLECT_SEGMENT_ORDS = null;
TermsAggregatorFactory.REMAP_GLOBAL_ORDS = null;
}
}
public void testGlobalOrdinalsExecutionHint() throws Exception {
randomizeAggregatorImpl = false;
Directory directory = newDirectory();
RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
indexWriter.close();

View File

@ -38,6 +38,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.function.LongUnaryOperator;
public class MissingValuesTests extends ESTestCase {
@ -111,7 +112,7 @@ public class MissingValuesTests extends ESTestCase {
ords[i][j] = TestUtil.nextInt(random(), ords[i][j], maxOrd - 1);
}
}
SortedSetDocValues asRandomAccessOrds = new AbstractSortedSetDocValues() {
SortedSetDocValues asSortedSet = new AbstractSortedSetDocValues() {
int doc = -1;
int i;
@ -147,7 +148,7 @@ public class MissingValuesTests extends ESTestCase {
final BytesRef missingMissing = new BytesRef(RandomStrings.randomAsciiOfLength(random(), 5));
for (BytesRef missing : Arrays.asList(existingMissing, missingMissing)) {
SortedSetDocValues withMissingReplaced = MissingValues.replaceMissing(asRandomAccessOrds, missing);
SortedSetDocValues withMissingReplaced = MissingValues.replaceMissing(asSortedSet, missing);
if (valueSet.contains(missing)) {
assertEquals(values.length, withMissingReplaced.getValueCount());
} else {
@ -169,6 +170,84 @@ public class MissingValuesTests extends ESTestCase {
}
}
public void testGlobalMapping() throws IOException {
final int numOrds = TestUtil.nextInt(random(), 1, 10);
final int numGlobalOrds = TestUtil.nextInt(random(), numOrds, numOrds + 3);
final Set<BytesRef> valueSet = new HashSet<>();
while (valueSet.size() < numOrds) {
valueSet.add(new BytesRef(RandomStrings.randomAsciiLettersOfLength(random(), 5)));
}
final BytesRef[] values = valueSet.toArray(new BytesRef[0]);
Arrays.sort(values);
final Set<BytesRef> globalValueSet = new HashSet<>(valueSet);
while (globalValueSet.size() < numGlobalOrds) {
globalValueSet.add(new BytesRef(RandomStrings.randomAsciiLettersOfLength(random(), 5)));
}
final BytesRef[] globalValues = globalValueSet.toArray(new BytesRef[0]);
Arrays.sort(globalValues);
// exists in the current segment
BytesRef missing = RandomPicks.randomFrom(random(), values);
doTestGlobalMapping(values, globalValues, missing);
// missing in all segments
do {
missing = new BytesRef(RandomStrings.randomAsciiLettersOfLength(random(), 5));
} while (globalValueSet.contains(missing));
doTestGlobalMapping(values, globalValues, missing);
if (globalValueSet.size() > valueSet.size()) {
// exists in other segments only
Set<BytesRef> other = new HashSet<>(globalValueSet);
other.removeAll(valueSet);
missing = RandomPicks.randomFrom(random(), other.toArray(new BytesRef[0]));
doTestGlobalMapping(values, globalValues, missing);
}
}
private void doTestGlobalMapping(BytesRef[] values, BytesRef[] globalValues, BytesRef missing) throws IOException {
LongUnaryOperator segmentToGlobalOrd = segmentOrd -> Arrays.binarySearch(globalValues, values[Math.toIntExact(segmentOrd)]);
SortedSetDocValues sortedValues = asOrds(values);
SortedSetDocValues sortedGlobalValues = asOrds(globalValues);
LongUnaryOperator withMissingSegmentToGlobalOrd = MissingValues.getGlobalMapping(
sortedValues, sortedGlobalValues, segmentToGlobalOrd, missing);
SortedSetDocValues withMissingValues = MissingValues.replaceMissing(sortedValues, missing);
SortedSetDocValues withMissingGlobalValues = MissingValues.replaceMissing(sortedGlobalValues, missing);
for (long segmentOrd = 0; segmentOrd < withMissingValues.getValueCount(); ++segmentOrd) {
long expectedGlobalOrd = withMissingSegmentToGlobalOrd.applyAsLong(segmentOrd);
assertEquals(withMissingValues.lookupOrd(segmentOrd), withMissingGlobalValues.lookupOrd(expectedGlobalOrd));
}
}
private static SortedSetDocValues asOrds(BytesRef[] values) {
return new AbstractSortedSetDocValues() {
@Override
public boolean advanceExact(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long nextOrd() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public BytesRef lookupOrd(long ord) throws IOException {
return values[Math.toIntExact(ord)];
}
@Override
public long getValueCount() {
return values.length;
}
};
}
public void testMissingLongs() throws IOException {
final int numDocs = TestUtil.nextInt(random(), 1, 100);
final int[][] values = new int[numDocs][];