preserve explicitly specified dimension schema in "logical" schema of sampler response (#14144)

This commit is contained in:
Clint Wylie 2023-04-23 08:58:05 -07:00 committed by GitHub
parent b95708f389
commit 887f8db1b5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 128 additions and 4 deletions

View File

@ -247,10 +247,19 @@ public class InputSourceSampler
if (!SamplerInputRow.SAMPLER_ORDERING_COLUMN.equals(dimensionDesc.getName())) { if (!SamplerInputRow.SAMPLER_ORDERING_COLUMN.equals(dimensionDesc.getName())) {
final ColumnType columnType = dimensionDesc.getCapabilities().toColumnType(); final ColumnType columnType = dimensionDesc.getCapabilities().toColumnType();
signatureBuilder.add(dimensionDesc.getName(), columnType); signatureBuilder.add(dimensionDesc.getName(), columnType);
// for now, use legacy types instead of standard type // use explicitly specified dimension schema if it exists
logicalDimensionSchemas.add( if (dataSchema != null &&
DimensionSchema.getDefaultSchemaForBuiltInType(dimensionDesc.getName(), dimensionDesc.getCapabilities()) dataSchema.getDimensionsSpec() != null &&
); dataSchema.getDimensionsSpec().getSchema(dimensionDesc.getName()) != null) {
logicalDimensionSchemas.add(dataSchema.getDimensionsSpec().getSchema(dimensionDesc.getName()));
} else {
logicalDimensionSchemas.add(
DimensionSchema.getDefaultSchemaForBuiltInType(
dimensionDesc.getName(),
dimensionDesc.getCapabilities()
)
);
}
physicalDimensionSchemas.add( physicalDimensionSchemas.add(
dimensionDesc.getIndexer().getFormat().getColumnSchema(dimensionDesc.getName()) dimensionDesc.getIndexer().getFormat().getColumnSchema(dimensionDesc.getName())
); );

View File

@ -33,6 +33,7 @@ import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.math.expr.ExpressionProcessing; import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.segment.AutoTypeColumnSchema; import org.apache.druid.segment.AutoTypeColumnSchema;
import org.apache.druid.segment.NestedDataDimensionSchema;
import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.RowSignature; import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.DataSchema;
@ -183,4 +184,118 @@ public class InputSourceSamplerDiscoveryTest extends InitializedNullHandlingTest
ExpressionProcessing.initializeForTests(); ExpressionProcessing.initializeForTests();
} }
} }
@Test
public void testTypesClassicDiscovery()
{
final InputSource inputSource = new InlineInputSource(Strings.join(STR_JSON_ROWS, '\n'));
final DataSchema dataSchema = new DataSchema(
"test",
new TimestampSpec("t", null, null),
DimensionsSpec.builder().build(),
null,
null,
null
);
final SamplerResponse response = inputSourceSampler.sample(
inputSource,
new JsonInputFormat(null, null, null, null, null),
dataSchema,
null
);
Assert.assertEquals(6, response.getNumRowsRead());
Assert.assertEquals(5, response.getNumRowsIndexed());
Assert.assertEquals(6, response.getData().size());
Assert.assertEquals(
ImmutableList.of(
new StringDimensionSchema("string"),
new StringDimensionSchema("long"),
new StringDimensionSchema("double"),
new StringDimensionSchema("bool"),
new StringDimensionSchema("variant"),
new StringDimensionSchema("array")
),
response.getLogicalDimensions()
);
Assert.assertEquals(
ImmutableList.of(
new StringDimensionSchema("string"),
new StringDimensionSchema("long"),
new StringDimensionSchema("double"),
new StringDimensionSchema("bool"),
new StringDimensionSchema("variant"),
new StringDimensionSchema("array")
),
response.getPhysicalDimensions()
);
Assert.assertEquals(
RowSignature.builder()
.addTimeColumn()
.add("string", ColumnType.STRING)
.add("long", ColumnType.STRING)
.add("double", ColumnType.STRING)
.add("bool", ColumnType.STRING)
.add("variant", ColumnType.STRING)
.add("array", ColumnType.STRING)
.build(),
response.getLogicalSegmentSchema()
);
}
@Test
public void testTypesNoDiscoveryExplicitSchema()
{
final InputSource inputSource = new InlineInputSource(Strings.join(STR_JSON_ROWS, '\n'));
final DataSchema dataSchema = new DataSchema(
"test",
new TimestampSpec("t", null, null),
DimensionsSpec.builder().setDimensions(
ImmutableList.of(new StringDimensionSchema("string"),
new LongDimensionSchema("long"),
new DoubleDimensionSchema("double"),
new StringDimensionSchema("bool"),
new NestedDataDimensionSchema("variant"),
new NestedDataDimensionSchema("array"),
new NestedDataDimensionSchema("nested")
)
).build(),
null,
null,
null
);
final SamplerResponse response = inputSourceSampler.sample(
inputSource,
new JsonInputFormat(null, null, null, null, null),
dataSchema,
null
);
Assert.assertEquals(6, response.getNumRowsRead());
Assert.assertEquals(5, response.getNumRowsIndexed());
Assert.assertEquals(6, response.getData().size());
Assert.assertEquals(
dataSchema.getDimensionsSpec().getDimensions(),
response.getLogicalDimensions()
);
Assert.assertEquals(
dataSchema.getDimensionsSpec().getDimensions(),
response.getPhysicalDimensions()
);
Assert.assertEquals(
RowSignature.builder()
.addTimeColumn()
.add("string", ColumnType.STRING)
.add("long", ColumnType.LONG)
.add("double", ColumnType.DOUBLE)
.add("bool", ColumnType.STRING)
.add("variant", ColumnType.NESTED_DATA)
.add("array", ColumnType.NESTED_DATA)
.add("nested", ColumnType.NESTED_DATA)
.build(),
response.getLogicalSegmentSchema()
);
}
} }