adjust topn heap operation when string is dictionary encoded, but not uniquely (#12291)

* add topn heap optimization when string is dictionary encoded, but not uniquely

* use array instead

* is same

* fix javadoc

* fix

* Update StringTopNColumnAggregatesProcessor.java
This commit is contained in:
Clint Wylie 2022-03-08 14:32:40 -08:00 committed by GitHub
parent 0e097ead36
commit dae53ae36a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -52,9 +52,8 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
@Override
public int getCardinality(DimensionSelector selector)
{
// only report the underlying selector cardinality if the column the selector is for is dictionary encoded, and
// the dictionary values are unique, that is they have a 1:1 mapping between dictionaryId and column value
if (capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue()) {
// only report the underlying selector cardinality if the column the selector is for is dictionary encoded
if (capabilities.isDictionaryEncoded().isTrue()) {
return selector.getValueCardinality();
}
return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
@ -117,17 +116,11 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
)
{
final boolean notUnknown = selector.getValueCardinality() != DimensionDictionarySelector.CARDINALITY_UNKNOWN;
final boolean unique = capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue();
// we must know cardinality to use array based aggregation
// we check for uniquely dictionary encoded values because non-unique (meaning dictionary ids do not have a 1:1
// relation with values) negates many of the benefits of array aggregation:
// - if different dictionary ids map to the same value but dictionary ids are unique to that value (*:1), then
// array aggregation will be correct but will still have to potentially perform many map lookups and lose the
// performance benefit array aggregation is trying to provide
// - in cases where the same dictionary ids map to different values (1:* or *:*), results can be entirely
// incorrect since an aggregator for a different value might be chosen from the array based on the re-used
// dictionary id
if (notUnknown && unique) {
final boolean hasDictionary = capabilities.isDictionaryEncoded().isTrue();
// we must know cardinality to use array based aggregation. in cases where the same dictionary ids map to different
// values (1:* or *:*), results can be entirely incorrect since an aggregator for a different value might be
// chosen from the array based on the re-used dictionary id
if (notUnknown && hasDictionary) {
return scanAndAggregateWithCardinalityKnown(query, cursor, selector, rowSelector);
} else {
return scanAndAggregateWithCardinalityUnknown(query, cursor, selector);
@ -140,6 +133,11 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
this.aggregatesStore = new HashMap<>();
}
/**
* scan and aggregate when column is dictionary encoded and value cardinality is known up front, so values are
* aggregated into an array position specified by the dictionaryid, which if not already present, are translated
* into the key and fetched (or created if they key hasn't been encountered) from the {@link #aggregatesStore}
*/
private long scanAndAggregateWithCardinalityKnown(
TopNQuery query,
Cursor cursor,
@ -172,6 +170,14 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
return processedRows;
}
/**
* this method is to allow scan and aggregate when values are not dictionary encoded
* (e.g. {@link DimensionSelector#nameLookupPossibleInAdvance()} is false and/or when
* {@link ColumnCapabilities#isDictionaryEncoded()} is false). This mode also uses hash table aggregation, storing
* results in {@link #aggregatesStore}, and must call {@link DimensionSelector#lookupName(int)} for every row which
* is processed and cannot cache lookups, or use the dictionary id in any way other than to lookup the current row
* value.
*/
private long scanAndAggregateWithCardinalityUnknown(
TopNQuery query,
Cursor cursor,