From 1eafe983eca409bdc5100a97797f542e08545a8b Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 8 Dec 2023 01:22:07 -0800 Subject: [PATCH] fix array presenting columns to not match single element arrays to scalars for equality (#15503) * fix array presenting columns to not match single element arrays to scalars for equality * update docs to clarify usage model of mixed type columns --- docs/ingestion/schema-design.md | 7 ++- .../druid/query/filter/EqualityFilter.java | 3 ++ .../nested/VariantColumnAndIndexSupplier.java | 3 ++ .../NestedDataTimeseriesQueryTest.java | 2 - .../segment/filter/EqualityFilterTests.java | 47 ++++++++++++++----- 5 files changed, 48 insertions(+), 14 deletions(-) diff --git a/docs/ingestion/schema-design.md b/docs/ingestion/schema-design.md index 0749d20b695..ce57b207fea 100644 --- a/docs/ingestion/schema-design.md +++ b/docs/ingestion/schema-design.md @@ -263,13 +263,18 @@ native boolean types, Druid ingests these values as longs if `druid.expressions. the [array functions](../querying/sql-array-functions.md) or [UNNEST](../querying/sql-functions.md#unnest). Nested columns can be queried with the [JSON functions](../querying/sql-json-functions.md). -Mixed type columns are stored in the _least_ restrictive type that can represent all values in the column. For example: +Mixed type columns follow the same rules for schema differences between segments, and present as the _least_ restrictive +type that can represent all values in the column. For example: - Mixed numeric columns are `DOUBLE` - If there are any strings present, then the column is a `STRING` - If there are arrays, then the column becomes an array with the least restrictive element type - Any nested data or arrays of nested data become `COMPLEX` nested columns. +Grouping, filtering, and aggregating mixed type values will handle these columns as if all values are represented as the +least restrictive type. The exception to this is the scan query, which will return the values in their original mixed +types, but any downstream operations on these values will still coerce them to the common type. + If you're already using string-based schema discovery and want to migrate, see [Migrating to type-aware schema discovery](#migrating-to-type-aware-schema-discovery). #### String-based schema discovery diff --git a/processing/src/main/java/org/apache/druid/query/filter/EqualityFilter.java b/processing/src/main/java/org/apache/druid/query/filter/EqualityFilter.java index ae7a5eaa134..f98a300b3dc 100644 --- a/processing/src/main/java/org/apache/druid/query/filter/EqualityFilter.java +++ b/processing/src/main/java/org/apache/druid/query/filter/EqualityFilter.java @@ -392,6 +392,9 @@ public class EqualityFilter extends AbstractOptimizableDimFilter implements Filt @Override public Predicate makeArrayPredicate(@Nullable TypeSignature arrayType) { + if (!matchValue.isArray()) { + return Predicates.alwaysFalse(); + } if (arrayType == null) { // fall back to per row detection if input array type is unknown return typeDetectingArrayPredicateSupplier.get(); diff --git a/processing/src/main/java/org/apache/druid/segment/nested/VariantColumnAndIndexSupplier.java b/processing/src/main/java/org/apache/druid/segment/nested/VariantColumnAndIndexSupplier.java index fcd2e4dca08..f81bffcd251 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/VariantColumnAndIndexSupplier.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/VariantColumnAndIndexSupplier.java @@ -320,6 +320,9 @@ public class VariantColumnAndIndexSupplier implements Supplier valueType) { + if (!valueType.isArray()) { + return new AllFalseBitmapColumnIndex(bitmapFactory, nullValueBitmap); + } final ExprEval eval = ExprEval.ofType(ExpressionType.fromColumnTypeStrict(valueType), value); final ExprEval castForComparison = ExprEval.castForEqualityComparison( eval, diff --git a/processing/src/test/java/org/apache/druid/query/timeseries/NestedDataTimeseriesQueryTest.java b/processing/src/test/java/org/apache/druid/query/timeseries/NestedDataTimeseriesQueryTest.java index 130654d092c..815a6ed9951 100644 --- a/processing/src/test/java/org/apache/druid/query/timeseries/NestedDataTimeseriesQueryTest.java +++ b/processing/src/test/java/org/apache/druid/query/timeseries/NestedDataTimeseriesQueryTest.java @@ -488,7 +488,6 @@ public class NestedDataTimeseriesQueryTest extends InitializedNullHandlingTest .intervals(Collections.singletonList(Intervals.ETERNITY)) .filters( new AndDimFilter( - new EqualityFilter("variantWithArrays", ColumnType.STRING, "1", null), new EqualityFilter("v0", ColumnType.STRING, "1", null) ) ) @@ -524,7 +523,6 @@ public class NestedDataTimeseriesQueryTest extends InitializedNullHandlingTest .intervals(Collections.singletonList(Intervals.ETERNITY)) .filters( new AndDimFilter( - new EqualityFilter("variantWithArrays", ColumnType.DOUBLE, 3.0, null), new EqualityFilter("v0", ColumnType.DOUBLE, 3.0, null) ) ) diff --git a/processing/src/test/java/org/apache/druid/segment/filter/EqualityFilterTests.java b/processing/src/test/java/org/apache/druid/segment/filter/EqualityFilterTests.java index 5a4431e0056..801d02a97ee 100644 --- a/processing/src/test/java/org/apache/druid/segment/filter/EqualityFilterTests.java +++ b/processing/src/test/java/org/apache/druid/segment/filter/EqualityFilterTests.java @@ -412,22 +412,18 @@ public class EqualityFilterTests { if (isAutoSchema()) { // auto ingests arrays instead of strings - // single values are implicitly upcast to single element arrays, so we get some matches here... if (NullHandling.sqlCompatible()) { - assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "", null), ImmutableList.of("2")); + assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "", null), ImmutableList.of()); + assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING_ARRAY, ImmutableList.of(""), null), ImmutableList.of("2")); } - assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "a", null), ImmutableList.of("3")); + assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "a", null), ImmutableList.of()); + assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING_ARRAY, ImmutableList.of("a"), null), ImmutableList.of("3")); assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "b", null), ImmutableList.of()); - assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "c", null), ImmutableList.of("4")); + assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "c", null), ImmutableList.of()); + assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING_ARRAY, ImmutableList.of("c"), null), ImmutableList.of("4")); assertFilterMatches(new EqualityFilter("dim2", ColumnType.STRING, "d", null), ImmutableList.of()); // array matchers can match the whole array - if (NullHandling.sqlCompatible()) { - assertFilterMatches( - new EqualityFilter("dim2", ColumnType.STRING, ImmutableList.of(""), null), - ImmutableList.of("2") - ); - } assertFilterMatches( new EqualityFilter("dim2", ColumnType.STRING_ARRAY, new Object[]{"a", "b"}, null), ImmutableList.of("0") @@ -994,7 +990,7 @@ public class EqualityFilterTests "3", .. [1.1, 2.2, 3.3] "4", .. 12.34 "5", .. [100, 200, 300] - + */ Assume.assumeTrue(isAutoSchema()); assertFilterMatches( @@ -1018,6 +1014,7 @@ public class EqualityFilterTests ImmutableList.of("0", "1", "2", "3", "4", "5") ); + // variant columns must be matched as arrays if they contain any arrays assertFilterMatches( new EqualityFilter( "variant", @@ -1025,6 +1022,15 @@ public class EqualityFilterTests "abc", null ), + ImmutableList.of() + ); + assertFilterMatches( + new EqualityFilter( + "variant", + ColumnType.STRING_ARRAY, + ImmutableList.of("abc"), + null + ), ImmutableList.of("0") ); @@ -1035,6 +1041,15 @@ public class EqualityFilterTests 100L, null ), + ImmutableList.of() + ); + assertFilterMatches( + new EqualityFilter( + "variant", + ColumnType.LONG_ARRAY, + ImmutableList.of(100L), + null + ), ImmutableList.of("1", "2") ); @@ -1045,6 +1060,15 @@ public class EqualityFilterTests "100", null ), + ImmutableList.of() + ); + assertFilterMatches( + new EqualityFilter( + "variant", + ColumnType.STRING_ARRAY, + new Object[]{"100"}, + null + ), ImmutableList.of("1", "2") ); @@ -1255,6 +1279,7 @@ public class EqualityFilterTests "cachedOptimizedFilter" ) .withPrefabValues(ColumnType.class, ColumnType.STRING, ColumnType.DOUBLE) + .withPrefabValues(ExprEval.class, ExprEval.of("hello"), ExprEval.of(1.0)) .withIgnoredFields("predicateFactory", "cachedOptimizedFilter", "matchValue") .verify(); }