Fix incorrect scale when reading decimal from parquet (#15715)

* Fix incorrect scale when reading decimal from parquet

* add comments

* fix test
This commit is contained in:
Maytas Monsereenusorn 2024-01-18 02:10:27 -08:00 committed by GitHub
parent a3b32fbd26
commit 55acf2e2ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 193 additions and 12 deletions

View File

@@ -426,9 +426,13 @@ public class ParquetGroupConverter
int scale = pt.asPrimitiveType().getDecimalMetadata().getScale();
switch (pt.getPrimitiveTypeName()) {
case INT32:
return new BigDecimal(g.getInteger(fieldIndex, index));
// The primitive returned from Group is an unscaledValue.
// We need to do unscaledValue * 10^(-scale) to convert back to decimal
return new BigDecimal(g.getInteger(fieldIndex, index)).movePointLeft(scale);
case INT64:
return new BigDecimal(g.getLong(fieldIndex, index));
// The primitive returned from Group is an unscaledValue.
// We need to do unscaledValue * 10^(-scale) to convert back to decimal
return new BigDecimal(g.getLong(fieldIndex, index)).movePointLeft(scale);
case FIXED_LEN_BYTE_ARRAY:
case BINARY:
Binary value = g.getBinary(fieldIndex, index);

View File

@@ -62,6 +62,35 @@ public class DecimalParquetInputTest extends BaseParquetInputTest
parserType,
true
);
/*
The raw data in the parquet file has the following columns:
############ Column(fixed_len_dec) ############
name: fixed_len_dec
path: fixed_len_dec
max_definition_level: 1
max_repetition_level: 0
physical_type: FIXED_LEN_BYTE_ARRAY
logical_type: Decimal(precision=10, scale=2)
converted_type (legacy): DECIMAL
The raw data in the parquet file has the following rows:
0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
0.0
1.0
2.0
3.0
4.0
5.0
*/
List<InputRow> rows = getAllRows(parserType, config);
Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
Assert.assertEquals("1.0", rows.get(0).getDimension("fixed_len_dec").get(0));
@@ -80,10 +109,39 @@ public class DecimalParquetInputTest extends BaseParquetInputTest
parserType,
true
);
/*
The raw data in the parquet file has the following columns:
############ Column(i32_dec) ############
name: i32_dec
path: i32_dec
max_definition_level: 1
max_repetition_level: 0
physical_type: INT32
logical_type: Decimal(precision=5, scale=2)
converted_type (legacy): DECIMAL
The raw data in the parquet file has the following rows:
0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
0
1.00
2.00
3.00
4.00
5.00
*/
List<InputRow> rows = getAllRows(parserType, config);
Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
Assert.assertEquals("100", rows.get(0).getDimension("i32_dec").get(0));
Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
Assert.assertEquals("1.00", rows.get(0).getDimension("i32_dec").get(0));
Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(0).getMetric("metric1"));
}
@Test
@@ -98,9 +156,38 @@ public class DecimalParquetInputTest extends BaseParquetInputTest
parserType,
true
);
/*
The raw data in the parquet file has the following columns:
############ Column(i64_dec) ############
name: i64_dec
path: i64_dec
max_definition_level: 1
max_repetition_level: 0
physical_type: INT64
logical_type: Decimal(precision=10, scale=2)
converted_type (legacy): DECIMAL
The raw data in the parquet file has the following rows:
0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
0
1.00
2.00
3.00
4.00
5.00
*/
List<InputRow> rows = getAllRows(parserType, config);
Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
Assert.assertEquals("100", rows.get(0).getDimension("i64_dec").get(0));
Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
Assert.assertEquals("1.00", rows.get(0).getDimension("i64_dec").get(0));
Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(0).getMetric("metric1"));
}
}

View File

@@ -63,6 +63,36 @@ public class DecimalParquetReaderTest extends BaseParquetReaderTest
flattenSpec
);
/*
The raw data in the parquet file has the following columns:
############ Column(fixed_len_dec) ############
name: fixed_len_dec
path: fixed_len_dec
max_definition_level: 1
max_repetition_level: 0
physical_type: FIXED_LEN_BYTE_ARRAY
logical_type: Decimal(precision=10, scale=2)
converted_type (legacy): DECIMAL
The raw data in the parquet file has the following rows:
0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
0.0
1.0
2.0
3.0
4.0
5.0
*/
List<InputRow> rows = readAllRows(reader);
Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(1).getTimestamp().toString());
Assert.assertEquals("1.0", rows.get(1).getDimension("fixed_len_dec").get(0));
@@ -100,10 +130,40 @@ public class DecimalParquetReaderTest extends BaseParquetReaderTest
flattenSpec
);
/*
The raw data in the parquet file has the following columns:
############ Column(i32_dec) ############
name: i32_dec
path: i32_dec
max_definition_level: 1
max_repetition_level: 0
physical_type: INT32
logical_type: Decimal(precision=5, scale=2)
converted_type (legacy): DECIMAL
The raw data in the parquet file has the following rows:
0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
0
1.00
2.00
3.00
4.00
5.00
*/
List<InputRow> rows = readAllRows(reader);
Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(1).getTimestamp().toString());
Assert.assertEquals("100", rows.get(1).getDimension("i32_dec").get(0));
Assert.assertEquals(new BigDecimal(100), rows.get(1).getMetric("metric1"));
Assert.assertEquals("1.00", rows.get(1).getDimension("i32_dec").get(0));
Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(1).getMetric("metric1"));
reader = createReader(
file,
@@ -112,7 +172,7 @@ public class DecimalParquetReaderTest extends BaseParquetReaderTest
);
List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
final String expectedJson = "{\n"
+ " \"i32_dec\" : 100\n"
+ " \"i32_dec\" : 1.00\n"
+ "}";
Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(1).getRawValues()));
}
@@ -137,10 +197,40 @@ public class DecimalParquetReaderTest extends BaseParquetReaderTest
flattenSpec
);
/*
The raw data in the parquet file has the following columns:
############ Column(i64_dec) ############
name: i64_dec
path: i64_dec
max_definition_level: 1
max_repetition_level: 0
physical_type: INT64
logical_type: Decimal(precision=10, scale=2)
converted_type (legacy): DECIMAL
The raw data in the parquet file has the following rows:
0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
0
1.00
2.00
3.00
4.00
5.00
*/
List<InputRow> rows = readAllRows(reader);
Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(1).getTimestamp().toString());
Assert.assertEquals("100", rows.get(1).getDimension("i64_dec").get(0));
Assert.assertEquals(new BigDecimal(100), rows.get(1).getMetric("metric1"));
Assert.assertEquals("1.00", rows.get(1).getDimension("i64_dec").get(0));
Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(1).getMetric("metric1"));
reader = createReader(
file,
@@ -149,7 +239,7 @@ public class DecimalParquetReaderTest extends BaseParquetReaderTest
);
List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
final String expectedJson = "{\n"
+ " \"i64_dec\" : 100\n"
+ " \"i64_dec\" : 1.00\n"
+ "}";
Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(1).getRawValues()));
}