adjust topn heap operation when string is dictionary encoded, but not uniquely (#12291)

* add topn heap optimization when string is dictionary encoded, but not uniquely
* use array instead
* is same
* fix javadoc
* fix
* Update StringTopNColumnAggregatesProcessor.java
2025-02-17 07:25:02 +00:00 · 2022-03-08 14:32:40 -08:00 · 2022-03-08 14:32:40 -08:00 · dae53ae36a
commit dae53ae36a
parent 0e097ead36
1 changed files with 20 additions and 14 deletions
--- a/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
+++ b/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
@@ -52,9 +52,8 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
  @Override
  public int getCardinality(DimensionSelector selector)
  {
-    // only report the underlying selector cardinality if the column the selector is for is dictionary encoded, and
-    // the dictionary values are unique, that is they have a 1:1 mapping between dictionaryId and column value
-    if (capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue()) {
+    // only report the underlying selector cardinality if the column the selector is for is dictionary encoded
+    if (capabilities.isDictionaryEncoded().isTrue()) {
      return selector.getValueCardinality();
    }
    return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
@@ -117,17 +116,11 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
  )
  {
    final boolean notUnknown = selector.getValueCardinality() != DimensionDictionarySelector.CARDINALITY_UNKNOWN;
-    final boolean unique = capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue();
-    // we must know cardinality to use array based aggregation
-    // we check for uniquely dictionary encoded values because non-unique (meaning dictionary ids do not have a 1:1
-    // relation with values) negates many of the benefits of array aggregation:
-    // - if different dictionary ids map to the same value but dictionary ids are unique to that value (*:1), then
-    //   array aggregation will be correct but will still have to potentially perform many map lookups and lose the
-    //   performance benefit array aggregation is trying to provide
-    // - in cases where the same dictionary ids map to different values (1:* or *:*), results can be entirely
-    //   incorrect since an aggregator for a different value might be chosen from the array based on the re-used
-    //   dictionary id
-    if (notUnknown && unique) {
+    final boolean hasDictionary = capabilities.isDictionaryEncoded().isTrue();
+    // we must know cardinality to use array based aggregation. in cases where the same dictionary ids map to different
+    // values (1:* or *:*), results can be entirely incorrect since an aggregator for a different value might be
+    // chosen from the array based on the re-used dictionary id
+    if (notUnknown && hasDictionary) {
      return scanAndAggregateWithCardinalityKnown(query, cursor, selector, rowSelector);
    } else {
      return scanAndAggregateWithCardinalityUnknown(query, cursor, selector);
@@ -140,6 +133,11 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
    this.aggregatesStore = new HashMap<>();
  }

+  /**
+   * scan and aggregate when column is dictionary encoded and value cardinality is known up front, so values are
+   * aggregated into an array position specified by the dictionary id, which, if not already present, are translated
+   * into the key and fetched (or created if the key hasn't been encountered) from the {@link #aggregatesStore}
+   */
  private long scanAndAggregateWithCardinalityKnown(
      TopNQuery query,
      Cursor cursor,
@@ -172,6 +170,14 @@ public class StringTopNColumnAggregatesProcessor implements TopNColumnAggregates
    return processedRows;
  }

+  /**
+   * this method is to allow scan and aggregate when values are not dictionary encoded
+   * (e.g. {@link DimensionSelector#nameLookupPossibleInAdvance()} is false and/or when
+   * {@link ColumnCapabilities#isDictionaryEncoded()} is false). This mode also uses hash table aggregation, storing
+   * results in {@link #aggregatesStore}, and must call {@link DimensionSelector#lookupName(int)} for every row which
+   * is processed and cannot cache lookups, or use the dictionary id in any way other than to look up the current row
+   * value.
+   */
  private long scanAndAggregateWithCardinalityUnknown(
      TopNQuery query,
      Cursor cursor,