mirror of
https://github.com/apache/druid.git
synced 2025-02-17 07:25:02 +00:00
adjust topn heap operation when string is dictionary encoded, but not uniquely (#12291)
* add topn heap optimization when string is dictionary encoded, but not uniquely * use array instead * is same * fix javadoc * fix * Update StringTopNColumnAggregatesProcessor.java
This commit is contained in:
parent
0e097ead36
commit
dae53ae36a
@ -52,9 +52,8 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
|
||||
@Override
|
||||
public int getCardinality(DimensionSelector selector)
|
||||
{
|
||||
// only report the underlying selector cardinality if the column the selector is for is dictionary encoded, and
|
||||
// the dictionary values are unique, that is they have a 1:1 mapping between dictionaryId and column value
|
||||
if (capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue()) {
|
||||
// only report the underlying selector cardinality if the column the selector is for is dictionary encoded
|
||||
if (capabilities.isDictionaryEncoded().isTrue()) {
|
||||
return selector.getValueCardinality();
|
||||
}
|
||||
return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
|
||||
@ -117,17 +116,11 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
|
||||
)
|
||||
{
|
||||
final boolean notUnknown = selector.getValueCardinality() != DimensionDictionarySelector.CARDINALITY_UNKNOWN;
|
||||
final boolean unique = capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue();
|
||||
// we must know cardinality to use array based aggregation
|
||||
// we check for uniquely dictionary encoded values because non-unique (meaning dictionary ids do not have a 1:1
|
||||
// relation with values) negates many of the benefits of array aggregation:
|
||||
// - if different dictionary ids map to the same value but dictionary ids are unique to that value (*:1), then
|
||||
// array aggregation will be correct but will still have to potentially perform many map lookups and lose the
|
||||
// performance benefit array aggregation is trying to provide
|
||||
// - in cases where the same dictionary ids map to different values (1:* or *:*), results can be entirely
|
||||
// incorrect since an aggregator for a different value might be chosen from the array based on the re-used
|
||||
// dictionary id
|
||||
if (notUnknown && unique) {
|
||||
final boolean hasDictionary = capabilities.isDictionaryEncoded().isTrue();
|
||||
// we must know cardinality to use array based aggregation. in cases where the same dictionary ids map to different
|
||||
// values (1:* or *:*), results can be entirely incorrect since an aggregator for a different value might be
|
||||
// chosen from the array based on the re-used dictionary id
|
||||
if (notUnknown && hasDictionary) {
|
||||
return scanAndAggregateWithCardinalityKnown(query, cursor, selector, rowSelector);
|
||||
} else {
|
||||
return scanAndAggregateWithCardinalityUnknown(query, cursor, selector);
|
||||
@ -140,6 +133,11 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
|
||||
this.aggregatesStore = new HashMap<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* scan and aggregate when column is dictionary encoded and value cardinality is known up front, so values are
|
||||
* aggregated into an array position specified by the dictionaryid, which if not already present, are translated
|
||||
* into the key and fetched (or created if they key hasn't been encountered) from the {@link #aggregatesStore}
|
||||
*/
|
||||
private long scanAndAggregateWithCardinalityKnown(
|
||||
TopNQuery query,
|
||||
Cursor cursor,
|
||||
@ -172,6 +170,14 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
|
||||
return processedRows;
|
||||
}
|
||||
|
||||
/**
|
||||
* this method is to allow scan and aggregate when values are not dictionary encoded
|
||||
* (e.g. {@link DimensionSelector#nameLookupPossibleInAdvance()} is false and/or when
|
||||
* {@link ColumnCapabilities#isDictionaryEncoded()} is false). This mode also uses hash table aggregation, storing
|
||||
* results in {@link #aggregatesStore}, and must call {@link DimensionSelector#lookupName(int)} for every row which
|
||||
* is processed and cannot cache lookups, or use the dictionary id in any way other than to lookup the current row
|
||||
* value.
|
||||
*/
|
||||
private long scanAndAggregateWithCardinalityUnknown(
|
||||
TopNQuery query,
|
||||
Cursor cursor,
|
||||
|
Loading…
x
Reference in New Issue
Block a user