Fix Parquet Reader for schema-less ingestion: need to read all columns (#13689)

* fix stuff

* address comments
This commit is contained in:
Maytas Monsereenusorn 2023-01-18 10:52:12 -10:00 committed by GitHub
parent fa493f1ebc
commit 1582d74f37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 50 additions and 3 deletions

View File

@ -60,6 +60,11 @@ public class ReaderUtils
// Find columns we need to read from the flattenSpec // Find columns we need to read from the flattenSpec
if (flattenSpec != null) { if (flattenSpec != null) {
if (dimensionsSpec.getDimensions().isEmpty() && flattenSpec.isUseFieldDiscovery()) {
// Schemaless ingestion with useFieldDiscovery needs to read all columns
return fullInputSchema;
}
// Parse columns needed from flattenSpec // Parse columns needed from flattenSpec
for (JSONPathFieldSpec fields : flattenSpec.getFields()) { for (JSONPathFieldSpec fields : flattenSpec.getFields()) {
if (fields.getType() == JSONPathFieldType.ROOT) { if (fields.getType() == JSONPathFieldType.ROOT) {
@ -117,21 +122,27 @@ public class ReaderUtils
fieldsRequired.retainAll(fullInputSchema); fieldsRequired.retainAll(fullInputSchema);
return fieldsRequired; return fieldsRequired;
} }
} else {
// Without flattenSpec, useFieldDiscovery defaults to true, and thus we need to read all
// columns since this is schemaless
if (dimensionsSpec.getDimensions().isEmpty()) {
return fullInputSchema;
}
} }
// Determine any fields we need to read from parquet file that is used in the transformSpec // Determine any fields we need to read from input file that is used in the transformSpec
List<Transform> transforms = transformSpec.getTransforms(); List<Transform> transforms = transformSpec.getTransforms();
for (Transform transform : transforms) { for (Transform transform : transforms) {
fieldsRequired.addAll(transform.getRequiredColumns()); fieldsRequired.addAll(transform.getRequiredColumns());
} }
// Determine any fields we need to read from parquet file that is used in the dimensionsSpec // Determine any fields we need to read from input file that is used in the dimensionsSpec
List<DimensionSchema> dimensionSchema = dimensionsSpec.getDimensions(); List<DimensionSchema> dimensionSchema = dimensionsSpec.getDimensions();
for (DimensionSchema dim : dimensionSchema) { for (DimensionSchema dim : dimensionSchema) {
fieldsRequired.add(dim.getName()); fieldsRequired.add(dim.getName());
} }
// Determine any fields we need to read from parquet file that is used in the metricsSpec // Determine any fields we need to read from input file that is used in the metricsSpec
for (AggregatorFactory agg : aggregators) { for (AggregatorFactory agg : aggregators) {
fieldsRequired.addAll(agg.requiredFields()); fieldsRequired.addAll(agg.requiredFields());
} }

View File

@ -328,4 +328,40 @@ public class ReaderUtilsTest extends InitializedNullHandlingTest
Set<String> actual = ReaderUtils.getColumnsRequiredForIngestion(fullInputSchema, timestampSpec, dimensionsSpec, TransformSpec.NONE, new AggregatorFactory[]{}, flattenSpec); Set<String> actual = ReaderUtils.getColumnsRequiredForIngestion(fullInputSchema, timestampSpec, dimensionsSpec, TransformSpec.NONE, new AggregatorFactory[]{}, flattenSpec);
Assert.assertEquals(ImmutableSet.of("B", "C"), actual); Assert.assertEquals(ImmutableSet.of("B", "C"), actual);
} }
@Test
public void testGetColumnsRequiredForSchemalessIngestionWithoutFlattenSpec()
{
// Schemaless ingestion (empty dimensionsSpec) with no flattenSpec: every column of the
// input schema must be read, since field discovery is implicitly on.
TimestampSpec timestampSpec = new TimestampSpec("A", "iso", null);
Set<String> actual = ReaderUtils.getColumnsRequiredForIngestion(
    fullInputSchema,
    timestampSpec,
    DimensionsSpec.EMPTY,
    TransformSpec.NONE,
    new AggregatorFactory[]{},
    null
);
Assert.assertEquals(fullInputSchema, actual);
}
@Test
public void testGetColumnsRequiredForSchemalessIngestionWithFlattenSpecAndUseFieldDiscovery()
{
// Schemaless ingestion (empty dimensionsSpec) with a flattenSpec whose useFieldDiscovery
// flag is true: the full input schema is still required.
TimestampSpec timestampSpec = new TimestampSpec("A", "iso", null);
JSONPathSpec flattenSpec = new JSONPathSpec(
    true,
    ImmutableList.of(new JSONPathFieldSpec(JSONPathFieldType.PATH, "CFlat", "$.C.time"))
);
Set<String> actual = ReaderUtils.getColumnsRequiredForIngestion(
    fullInputSchema,
    timestampSpec,
    DimensionsSpec.EMPTY,
    TransformSpec.NONE,
    new AggregatorFactory[]{},
    flattenSpec
);
Assert.assertEquals(fullInputSchema, actual);
}
@Test
public void testGetColumnsRequiredForSchemalessIngestionWithFlattenSpecAndNotUseFieldDiscovery()
{
// Schemaless ingestion with useFieldDiscovery=false: only the columns referenced by the
// timestampSpec ("A") and the flattenSpec path ("C") are required, not the whole schema.
TimestampSpec timestampSpec = new TimestampSpec("A", "iso", null);
JSONPathSpec flattenSpec = new JSONPathSpec(
    false,
    ImmutableList.of(new JSONPathFieldSpec(JSONPathFieldType.PATH, "CFlat", "$.C.time"))
);
Set<String> actual = ReaderUtils.getColumnsRequiredForIngestion(
    fullInputSchema,
    timestampSpec,
    DimensionsSpec.EMPTY,
    TransformSpec.NONE,
    new AggregatorFactory[]{},
    flattenSpec
);
Assert.assertEquals(ImmutableSet.of("A", "C"), actual);
}
} }