From 9ed8beca5e800301912c42364a208dbc93f629ae Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 11 Apr 2023 20:49:52 -0700 Subject: [PATCH] bug fixes and add support for boolean inputs to classic long dimension indexer (#14069) changes: * adds support for boolean inputs to the classic long dimension indexer, which plays nice with LONG being the semi official boolean type in Druid, and even nicer when druid.expressions.useStrictBooleans is set to true, since the sampler when using the new 'auto' schema when 'useSchemaDiscovery' is specified on the dimensions spec will call the type out as LONG * fix bugs with sampler response and new schema discovery stuff incorrectly using classic 'json' type for the logical schema instead of the new 'auto' type --- .../InputSourceSamplerDiscoveryTest.java | 186 ++++++++++++++++++ .../data/input/impl/DimensionSchema.java | 4 +- .../druid/data/input/impl/DimensionsSpec.java | 1 - .../druid/segment/DimensionHandlerUtils.java | 3 + .../incremental/IncrementalIndexTest.java | 116 ++++++++++- 5 files changed, 306 insertions(+), 4 deletions(-) create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java new file mode 100644 index 00000000000..c1a7ebdde18 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.overlord.sampler; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableList; +import org.apache.druid.client.indexing.SamplerResponse; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.DoubleDimensionSchema; +import org.apache.druid.data.input.impl.InlineInputSource; +import org.apache.druid.data.input.impl.JsonInputFormat; +import org.apache.druid.data.input.impl.LongDimensionSchema; +import org.apache.druid.data.input.impl.StringDimensionSchema; +import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.jackson.DefaultObjectMapper; +import org.apache.druid.math.expr.ExpressionProcessing; +import org.apache.druid.segment.AutoTypeColumnSchema; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.segment.indexing.DataSchema; +import org.apache.druid.testing.InitializedNullHandlingTest; +import org.apache.logging.log4j.util.Strings; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +public class InputSourceSamplerDiscoveryTest extends InitializedNullHandlingTest +{ + private static final ObjectMapper OBJECT_MAPPER = new DefaultObjectMapper(); + private static final List STR_JSON_ROWS = ImmutableList.of( + "{ \"t\": \"2019-04-22T12:00\", \"string\": \"a\", \"long\": 1, \"double\":1.1, \"bool\":true, \"variant\":\"a\", \"array\":[1, 2, 3], \"nested\": {\"x\":1, \"y\": 2.0}}", + "{ \"t\": \"2019-04-22T12:00\", \"string\": \"b\", \"long\": 2, \"double\":2.2, \"bool\":false, \"variant\": 1.0, \"array\":[4, 5, 6], \"nested\": {\"x\":2, \"y\": 4.0} }", + "{ \"t\": \"2019-04-22T12:01\", \"string\": null, \"long\": null, \"double\":3.3, \"bool\":null, \"variant\":2, \"array\":[7, 8, 9], \"nested\": {\"x\":3, \"y\": 6.0} }", + "{ \"t\": \"2019-04-22T12:00\", \"string\": \"c\", \"long\": 4, \"double\":4.4, \"bool\":true, \"variant\":\"3\", \"array\":[10, 11, 12], \"nested\": {\"x\":4, \"y\": 8.0} }", + "{ \"t\": \"2019-04-22T12:00\", \"string\": \"d\", \"long\": 5, \"double\":null, \"bool\":false, \"variant\":null, \"array\":[13, 14, 15], \"nested\": {\"x\":5, \"y\": 10.0} }", + "{ \"t\": \"bad_timestamp\", \"string\": \"e\", \"long\": 6, \"double\":6.6, \"bool\":true, \"variant\":\"4\", \"array\":[16, 17, 18], \"nested\": {\"x\":6, \"y\": 12.0} }" + ); + private InputSourceSampler inputSourceSampler = new InputSourceSampler(OBJECT_MAPPER); + + @Test + public void testDiscoveredTypes() + { + final InputSource inputSource = new InlineInputSource(Strings.join(STR_JSON_ROWS, '\n')); + final SamplerResponse response = inputSourceSampler.sample( + inputSource, + new JsonInputFormat(null, null, null, null, null), + new DataSchema( + "test", + new TimestampSpec("t", null, null), + DimensionsSpec.builder().useSchemaDiscovery(true).build(), + null, + null, + null + ), + null + ); + + Assert.assertEquals(6, response.getNumRowsRead()); + Assert.assertEquals(5, response.getNumRowsIndexed()); + Assert.assertEquals(6, response.getData().size()); + Assert.assertEquals( + ImmutableList.of( + new StringDimensionSchema("string"), + new LongDimensionSchema("long"), + new DoubleDimensionSchema("double"), + new StringDimensionSchema("bool"), + new AutoTypeColumnSchema("variant"), + new AutoTypeColumnSchema("array"), + new AutoTypeColumnSchema("nested") + ), + response.getLogicalDimensions() + ); + + Assert.assertEquals( + ImmutableList.of( + new AutoTypeColumnSchema("string"), + new AutoTypeColumnSchema("long"), + new AutoTypeColumnSchema("double"), + new AutoTypeColumnSchema("bool"), + new AutoTypeColumnSchema("variant"), + new AutoTypeColumnSchema("array"), + new AutoTypeColumnSchema("nested") + ), + response.getPhysicalDimensions() + ); + Assert.assertEquals( + RowSignature.builder() + .addTimeColumn() + .add("string", ColumnType.STRING) + .add("long", ColumnType.LONG) + .add("double", ColumnType.DOUBLE) + .add("bool", ColumnType.STRING) + .add("variant", ColumnType.NESTED_DATA) + .add("array", ColumnType.LONG_ARRAY) + .add("nested", ColumnType.NESTED_DATA) + .build(), + response.getLogicalSegmentSchema() + ); + } + + @Test + public void testDiscoveredTypesStrictBooleans() + { + try { + ExpressionProcessing.initializeForStrictBooleansTests(true); + final InputSource inputSource = new InlineInputSource(Strings.join(STR_JSON_ROWS, '\n')); + final SamplerResponse response = inputSourceSampler.sample( + inputSource, + new JsonInputFormat(null, null, null, null, null), + new DataSchema( + "test", + new TimestampSpec("t", null, null), + DimensionsSpec.builder().useSchemaDiscovery(true).build(), + null, + null, + null + ), + null + ); + + Assert.assertEquals(6, response.getNumRowsRead()); + Assert.assertEquals(5, response.getNumRowsIndexed()); + Assert.assertEquals(6, response.getData().size()); + Assert.assertEquals( + ImmutableList.of( + new StringDimensionSchema("string"), + new LongDimensionSchema("long"), + new DoubleDimensionSchema("double"), + new LongDimensionSchema("bool"), + new AutoTypeColumnSchema("variant"), + new AutoTypeColumnSchema("array"), + new AutoTypeColumnSchema("nested") + ), + response.getLogicalDimensions() + ); + + Assert.assertEquals( + ImmutableList.of( + new AutoTypeColumnSchema("string"), + new AutoTypeColumnSchema("long"), + new AutoTypeColumnSchema("double"), + new AutoTypeColumnSchema("bool"), + new AutoTypeColumnSchema("variant"), + new AutoTypeColumnSchema("array"), + new AutoTypeColumnSchema("nested") + ), + response.getPhysicalDimensions() + ); + Assert.assertEquals( + RowSignature.builder() + .addTimeColumn() + .add("string", ColumnType.STRING) + .add("long", ColumnType.LONG) + .add("double", ColumnType.DOUBLE) + .add("bool", ColumnType.LONG) + .add("variant", ColumnType.NESTED_DATA) + .add("array", ColumnType.LONG_ARRAY) + .add("nested", ColumnType.NESTED_DATA) + .build(), + response.getLogicalSegmentSchema() + ); + } + finally { + ExpressionProcessing.initializeForTests(); + } + } +} diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/DimensionSchema.java b/processing/src/main/java/org/apache/druid/data/input/impl/DimensionSchema.java index 5c1e372c275..0009b767af9 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/DimensionSchema.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/DimensionSchema.java @@ -68,8 +68,8 @@ public abstract class DimensionSchema case DOUBLE: return new DoubleDimensionSchema(name); default: - // the nested column indexer can handle any type - return new NestedDataDimensionSchema(name); + // the auto column indexer can handle any type + return new AutoTypeColumnSchema(name); } } diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/DimensionsSpec.java b/processing/src/main/java/org/apache/druid/data/input/impl/DimensionsSpec.java index 2a73d3cb30b..553d5172b5f 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/DimensionsSpec.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/DimensionsSpec.java @@ -300,7 +300,6 @@ public class DimensionsSpec private List dimensionExclusions; private List spatialDimensions; private boolean includeAllDimensions; - private boolean useSchemaDiscovery; public Builder setDimensions(List dimensions) diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 8009afd654a..2b79154aace 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -29,6 +29,7 @@ import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.guava.Comparators; import org.apache.druid.java.util.common.parsers.ParseException; +import org.apache.druid.math.expr.Evals; import org.apache.druid.query.ColumnSelectorPlus; import org.apache.druid.query.dimension.ColumnSelectorStrategy; import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; @@ -321,6 +322,8 @@ public final class DimensionHandlerUtils return (Long) valObj; } else if (valObj instanceof Number) { return ((Number) valObj).longValue(); + } else if (valObj instanceof Boolean) { + return Evals.asLong((Boolean) valObj); } else if (valObj instanceof String) { Long ret = DimensionHandlerUtils.getExactLongFromDecimalString((String) valObj); if (reportParseExceptions && ret == null) { diff --git a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexTest.java b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexTest.java index 9c2523d29ed..9171c075294 100644 --- a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexTest.java +++ b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexTest.java @@ -24,6 +24,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import org.apache.druid.data.input.MapBasedInputRow; +import org.apache.druid.data.input.Row; import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.DoubleDimensionSchema; import org.apache.druid.data.input.impl.FloatDimensionSchema; @@ -36,7 +37,10 @@ import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.FilteredAggregatorFactory; import org.apache.druid.query.filter.SelectorDimFilter; +import org.apache.druid.segment.AutoTypeColumnSchema; import org.apache.druid.segment.CloserRule; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.nested.StructuredData; import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; import org.junit.Rule; @@ -46,6 +50,8 @@ import org.junit.runners.Parameterized; import java.util.Arrays; import java.util.Collection; +import java.util.Iterator; +import java.util.List; /** */ @@ -77,7 +83,14 @@ public class IncrementalIndexTest extends InitializedNullHandlingTest new StringDimensionSchema("string"), new FloatDimensionSchema("float"), new LongDimensionSchema("long"), - new DoubleDimensionSchema("double") + new DoubleDimensionSchema("double"), + new StringDimensionSchema("bool_string"), + new LongDimensionSchema("bool_long"), + new AutoTypeColumnSchema("bool_auto"), + new AutoTypeColumnSchema("array_string"), + new AutoTypeColumnSchema("array_double"), + new AutoTypeColumnSchema("array_long"), + new AutoTypeColumnSchema("nested") ) ); AggregatorFactory[] metrics = { @@ -320,4 +333,105 @@ public class IncrementalIndexTest extends InitializedNullHandlingTest Assert.assertEquals(1, index.size()); } + + @Test + public void testTypeHandling() throws IndexSizeExceededException + { + IncrementalIndex index = indexCreator.createIndex(); + + final List dims = Arrays.asList( + "string", + "float", + "long", + "double", + "bool_string", + "bool_long", + "bool_auto", + "array_string", + "array_long", + "array_double", + "nested" + ); + IncrementalIndexAddResult result = index.add( + new MapBasedInputRow( + 0, + dims, + ImmutableMap.builder() + .put("string", "a") + .put("float", 1.0) + .put("long", 1) + .put("double", 1.0) + .put("bool_string", true) + .put("bool_long", true) + .put("bool_auto", true) + .put("array_string", ImmutableList.of("a", "b", "c")) + .put("array_long", ImmutableList.of(1, 2, 3)) + .put("array_double", ImmutableList.of(1.1, 2.2, 3.3)) + .put("nested", ImmutableMap.of("x", 1, "y", ImmutableList.of("a", "b"))) + .build() + ) + ); + Assert.assertNull(result.getParseException()); + result = index.add( + new MapBasedInputRow( + 60_000, // next minute so non-rollup still orders iterator correctly + dims, + ImmutableMap.builder() + .put("string", "b") + .put("float", 2.0) + .put("long", 2) + .put("double", 2.0) + .put("bool_string", false) + .put("bool_long", false) + .put("bool_auto", false) + .put("array_string", ImmutableList.of("d", "e", "f")) + .put("array_long", ImmutableList.of(4, 5, 6)) + .put("array_double", ImmutableList.of(4.4, 5.5, 6.6)) + .put("nested", ImmutableMap.of("x", 2, "y", ImmutableList.of("c", "d"))) + .build() + ) + ); + Assert.assertNull(result.getParseException()); + + Assert.assertEquals(ColumnType.STRING, index.getColumnCapabilities("string").toColumnType()); + Assert.assertEquals(ColumnType.FLOAT, index.getColumnCapabilities("float").toColumnType()); + Assert.assertEquals(ColumnType.LONG, index.getColumnCapabilities("long").toColumnType()); + Assert.assertEquals(ColumnType.DOUBLE, index.getColumnCapabilities("double").toColumnType()); + Assert.assertEquals(ColumnType.STRING, index.getColumnCapabilities("bool_string").toColumnType()); + Assert.assertEquals(ColumnType.LONG, index.getColumnCapabilities("bool_long").toColumnType()); + // depends on value of 'druid.expressions.useStrictBooleans', current default is false which parses as strings + Assert.assertEquals(ColumnType.STRING, index.getColumnCapabilities("bool_auto").toColumnType()); + Assert.assertEquals(ColumnType.STRING_ARRAY, index.getColumnCapabilities("array_string").toColumnType()); + Assert.assertEquals(ColumnType.LONG_ARRAY, index.getColumnCapabilities("array_long").toColumnType()); + Assert.assertEquals(ColumnType.DOUBLE_ARRAY, index.getColumnCapabilities("array_double").toColumnType()); + Assert.assertEquals(ColumnType.NESTED_DATA, index.getColumnCapabilities("nested").toColumnType()); + + + Iterator rowIterator = index.iterator(); + Row row = rowIterator.next(); + Assert.assertEquals("a", row.getRaw("string")); + Assert.assertEquals(1.0f, row.getRaw("float")); + Assert.assertEquals(1L, row.getRaw("long")); + Assert.assertEquals(1.0, row.getRaw("double")); + Assert.assertEquals("true", row.getRaw("bool_string")); + Assert.assertEquals(1L, row.getRaw("bool_long")); + Assert.assertEquals(StructuredData.wrap(true), row.getRaw("bool_auto")); + Assert.assertEquals(StructuredData.wrap(ImmutableList.of("a", "b", "c")), row.getRaw("array_string")); + Assert.assertEquals(StructuredData.wrap(ImmutableList.of(1, 2, 3)), row.getRaw("array_long")); + Assert.assertEquals(StructuredData.wrap(ImmutableList.of(1.1, 2.2, 3.3)), row.getRaw("array_double")); + Assert.assertEquals(StructuredData.wrap(ImmutableMap.of("x", 1, "y", ImmutableList.of("a", "b"))), row.getRaw("nested")); + + row = rowIterator.next(); + Assert.assertEquals("b", row.getRaw("string")); + Assert.assertEquals(2.0f, row.getRaw("float")); + Assert.assertEquals(2L, row.getRaw("long")); + Assert.assertEquals(2.0, row.getRaw("double")); + Assert.assertEquals("false", row.getRaw("bool_string")); + Assert.assertEquals(0L, row.getRaw("bool_long")); + Assert.assertEquals(StructuredData.wrap(false), row.getRaw("bool_auto")); + Assert.assertEquals(StructuredData.wrap(ImmutableList.of("d", "e", "f")), row.getRaw("array_string")); + Assert.assertEquals(StructuredData.wrap(ImmutableList.of(4, 5, 6)), row.getRaw("array_long")); + Assert.assertEquals(StructuredData.wrap(ImmutableList.of(4.4, 5.5, 6.6)), row.getRaw("array_double")); + Assert.assertEquals(StructuredData.wrap(ImmutableMap.of("x", 2, "y", ImmutableList.of("c", "d"))), row.getRaw("nested")); + } }