Upgrade delta kernel from 3.1.0 to 3.2.0 (#16513)

Upstream release: https://github.com/delta-io/delta/releases/tag/v3.2.0

- Upgrade kernel dependency to 3.2.0
- Notable breaking changes introduced upstream that affect the Druid extension:
 - Rename TableClient -> Engine
 - Rename DefaultTableClient -> DefaultEngine
 - Exceptions moved to a separate package
 - Table.forPath() no longer throws TableNotFoundException. Instead, the exception is thrown
   when getting snapshot info (e.g., getLatestSnapshot()) from the Table object
This commit is contained in:
Abhishek Radhakrishnan 2024-05-29 10:46:30 -07:00 committed by GitHub
parent b3b62ac431
commit 75937c98e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 67 additions and 68 deletions

View File

@ -35,7 +35,7 @@
<modelVersion>4.0.0</modelVersion>
<properties>
<delta-kernel.version>3.1.0</delta-kernel.version>
<delta-kernel.version>3.2.0</delta-kernel.version>
</properties>
<dependencies>

View File

@ -28,12 +28,12 @@ import io.delta.kernel.Scan;
import io.delta.kernel.ScanBuilder;
import io.delta.kernel.Snapshot;
import io.delta.kernel.Table;
import io.delta.kernel.TableNotFoundException;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.data.ColumnarBatch;
import io.delta.kernel.data.FilteredColumnarBatch;
import io.delta.kernel.data.Row;
import io.delta.kernel.defaults.client.DefaultTableClient;
import io.delta.kernel.defaults.engine.DefaultEngine;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.exceptions.TableNotFoundException;
import io.delta.kernel.expressions.Predicate;
import io.delta.kernel.internal.InternalScanFileUtils;
import io.delta.kernel.internal.data.ScanStateRow;
@ -120,7 +120,7 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
/**
* Instantiates a {@link DeltaInputSourceReader} to read the Delta table rows. If a {@link DeltaSplit} is supplied,
* the Delta files and schema are obtained from it to instantiate the reader. Otherwise, a Delta table client is
* the Delta files and schema are obtained from it to instantiate the reader. Otherwise, the Delta engine is
* instantiated with the supplied configuration to read the table.
*
* @param inputRowSchema schema for {@link org.apache.druid.data.input.InputRow}
@ -134,40 +134,40 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
File temporaryDirectory
)
{
final TableClient tableClient = createTableClient();
final Engine engine = createDeltaEngine();
try {
final List<CloseableIterator<FilteredColumnarBatch>> scanFileDataIters = new ArrayList<>();
if (deltaSplit != null) {
final Row scanState = deserialize(tableClient, deltaSplit.getStateRow());
final Row scanState = deserialize(engine, deltaSplit.getStateRow());
final StructType physicalReadSchema =
ScanStateRow.getPhysicalDataReadSchema(tableClient, scanState);
ScanStateRow.getPhysicalDataReadSchema(engine, scanState);
for (String file : deltaSplit.getFiles()) {
final Row scanFile = deserialize(tableClient, file);
final Row scanFile = deserialize(engine, file);
scanFileDataIters.add(
getTransformedDataIterator(tableClient, scanState, scanFile, physicalReadSchema, Optional.empty())
getTransformedDataIterator(engine, scanState, scanFile, physicalReadSchema, Optional.empty())
);
}
} else {
final Table table = Table.forPath(tableClient, tablePath);
final Snapshot latestSnapshot = table.getLatestSnapshot(tableClient);
final StructType fullSnapshotSchema = latestSnapshot.getSchema(tableClient);
final Table table = Table.forPath(engine, tablePath);
final Snapshot latestSnapshot = table.getLatestSnapshot(engine);
final StructType fullSnapshotSchema = latestSnapshot.getSchema(engine);
final StructType prunedSchema = pruneSchema(
fullSnapshotSchema,
inputRowSchema.getColumnsFilter()
);
final ScanBuilder scanBuilder = latestSnapshot.getScanBuilder(tableClient);
final ScanBuilder scanBuilder = latestSnapshot.getScanBuilder(engine);
if (filter != null) {
scanBuilder.withFilter(tableClient, filter.getFilterPredicate(fullSnapshotSchema));
scanBuilder.withFilter(engine, filter.getFilterPredicate(fullSnapshotSchema));
}
final Scan scan = scanBuilder.withReadSchema(tableClient, prunedSchema).build();
final CloseableIterator<FilteredColumnarBatch> scanFilesIter = scan.getScanFiles(tableClient);
final Row scanState = scan.getScanState(tableClient);
final Scan scan = scanBuilder.withReadSchema(engine, prunedSchema).build();
final CloseableIterator<FilteredColumnarBatch> scanFilesIter = scan.getScanFiles(engine);
final Row scanState = scan.getScanState(engine);
final StructType physicalReadSchema =
ScanStateRow.getPhysicalDataReadSchema(tableClient, scanState);
ScanStateRow.getPhysicalDataReadSchema(engine, scanState);
while (scanFilesIter.hasNext()) {
final FilteredColumnarBatch scanFileBatch = scanFilesIter.next();
@ -176,7 +176,7 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
while (scanFileRows.hasNext()) {
final Row scanFile = scanFileRows.next();
scanFileDataIters.add(
getTransformedDataIterator(tableClient, scanState, scanFile, physicalReadSchema, scan.getRemainingFilter())
getTransformedDataIterator(engine, scanState, scanFile, physicalReadSchema, scan.getRemainingFilter())
);
}
}
@ -203,26 +203,26 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
return Stream.of(new InputSplit<>(deltaSplit));
}
final TableClient tableClient = createTableClient();
final Engine engine = createDeltaEngine();
final Snapshot latestSnapshot;
final Table table = Table.forPath(engine, tablePath);
try {
final Table table = Table.forPath(tableClient, tablePath);
latestSnapshot = table.getLatestSnapshot(tableClient);
latestSnapshot = table.getLatestSnapshot(engine);
}
catch (TableNotFoundException e) {
throw InvalidInput.exception(e, "tablePath[%s] not found.", tablePath);
}
final StructType fullSnapshotSchema = latestSnapshot.getSchema(tableClient);
final StructType fullSnapshotSchema = latestSnapshot.getSchema(engine);
final ScanBuilder scanBuilder = latestSnapshot.getScanBuilder(tableClient);
final ScanBuilder scanBuilder = latestSnapshot.getScanBuilder(engine);
if (filter != null) {
scanBuilder.withFilter(tableClient, filter.getFilterPredicate(fullSnapshotSchema));
scanBuilder.withFilter(engine, filter.getFilterPredicate(fullSnapshotSchema));
}
final Scan scan = scanBuilder.withReadSchema(tableClient, fullSnapshotSchema).build();
final Scan scan = scanBuilder.withReadSchema(engine, fullSnapshotSchema).build();
// scan files iterator for the current snapshot
final CloseableIterator<FilteredColumnarBatch> scanFilesIterator = scan.getScanFiles(tableClient);
final CloseableIterator<FilteredColumnarBatch> scanFilesIterator = scan.getScanFiles(engine);
final Row scanState = scan.getScanState(tableClient);
final Row scanState = scan.getScanState(engine);
final String scanStateStr = RowSerde.serializeRowToJson(scanState);
Iterator<DeltaSplit> deltaSplitIterator = Iterators.transform(
@ -256,9 +256,9 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
);
}
private Row deserialize(TableClient tableClient, String row)
private Row deserialize(Engine engine, String row)
{
return RowSerde.deserializeRowFromJson(tableClient, row);
return RowSerde.deserializeRowFromJson(engine, row);
}
/**
@ -285,17 +285,17 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
}
/**
* @return a table client where the client is initialized with {@link Configuration} class that uses the class's
* @return a Delta engine initialized with {@link Configuration} class that uses the class's
* class loader instead of the context classloader. The latter by default doesn't know about the extension classes,
* so the table client cannot load runtime classes resulting in {@link ClassNotFoundException}.
* so the Delta engine cannot load runtime classes resulting in {@link ClassNotFoundException}.
*/
private TableClient createTableClient()
private Engine createDeltaEngine()
{
final ClassLoader currCtxClassloader = Thread.currentThread().getContextClassLoader();
try {
Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
final Configuration conf = new Configuration();
return DefaultTableClient.create(conf);
return DefaultEngine.create(conf);
}
finally {
Thread.currentThread().setContextClassLoader(currCtxClassloader);
@ -308,7 +308,7 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
* SingleThreadedTableReader.java</a>.
*/
private CloseableIterator<FilteredColumnarBatch> getTransformedDataIterator(
final TableClient tableClient,
final Engine engine,
final Row scanState,
final Row scanFile,
final StructType physicalReadSchema,
@ -317,14 +317,14 @@ public class DeltaInputSource implements SplittableInputSource<DeltaSplit>
{
final FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFile);
final CloseableIterator<ColumnarBatch> physicalDataIter = tableClient.getParquetHandler().readParquetFiles(
final CloseableIterator<ColumnarBatch> physicalDataIter = engine.getParquetHandler().readParquetFiles(
Utils.singletonCloseableIterator(fileStatus),
physicalReadSchema,
optionalPredicate
);
return Scan.transformPhysicalData(
tableClient,
engine,
scanState,
scanFile,
physicalDataIter

View File

@ -23,9 +23,9 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.data.Row;
import io.delta.kernel.defaults.internal.data.DefaultJsonRow;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.internal.util.VectorUtils;
import io.delta.kernel.types.ArrayType;
import io.delta.kernel.types.BooleanType;
@ -84,13 +84,12 @@ public class RowSerde
/**
* Utility method to deserialize a {@link Row} object from the JSON form.
*/
public static Row deserializeRowFromJson(TableClient tableClient, String jsonRowWithSchema)
public static Row deserializeRowFromJson(Engine engine, String jsonRowWithSchema)
{
try {
JsonNode jsonNode = OBJECT_MAPPER.readTree(jsonRowWithSchema);
JsonNode schemaNode = jsonNode.get("schema");
StructType schema =
tableClient.getJsonHandler().deserializeStructType(schemaNode.asText());
StructType schema = engine.getJsonHandler().deserializeStructType(schemaNode.asText());
return parseRowFromJsonWithSchema((ObjectNode) jsonNode.get("row"), schema);
}
catch (JsonProcessingException e) {

View File

@ -20,12 +20,12 @@
package org.apache.druid.delta.input;
import io.delta.kernel.Scan;
import io.delta.kernel.TableNotFoundException;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.data.ColumnarBatch;
import io.delta.kernel.data.FilteredColumnarBatch;
import io.delta.kernel.data.Row;
import io.delta.kernel.defaults.client.DefaultTableClient;
import io.delta.kernel.defaults.engine.DefaultEngine;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.exceptions.TableNotFoundException;
import io.delta.kernel.internal.InternalScanFileUtils;
import io.delta.kernel.internal.data.ScanStateRow;
import io.delta.kernel.internal.util.Utils;
@ -68,13 +68,13 @@ public class DeltaInputRowTest
final List<Map<String, Object>> expectedRows
) throws TableNotFoundException, IOException
{
final TableClient tableClient = DefaultTableClient.create(new Configuration());
final Scan scan = DeltaTestUtils.getScan(tableClient, deltaTablePath);
final Engine engine = DefaultEngine.create(new Configuration());
final Scan scan = DeltaTestUtils.getScan(engine, deltaTablePath);
final Row scanState = scan.getScanState(tableClient);
final StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(tableClient, scanState);
final Row scanState = scan.getScanState(engine);
final StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(engine, scanState);
final CloseableIterator<FilteredColumnarBatch> scanFileIter = scan.getScanFiles(tableClient);
final CloseableIterator<FilteredColumnarBatch> scanFileIter = scan.getScanFiles(engine);
int totalRecordCount = 0;
while (scanFileIter.hasNext()) {
final FilteredColumnarBatch scanFileBatch = scanFileIter.next();
@ -84,13 +84,13 @@ public class DeltaInputRowTest
final Row scanFile = scanFileRows.next();
final FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFile);
final CloseableIterator<ColumnarBatch> physicalDataIter = tableClient.getParquetHandler().readParquetFiles(
final CloseableIterator<ColumnarBatch> physicalDataIter = engine.getParquetHandler().readParquetFiles(
Utils.singletonCloseableIterator(fileStatus),
physicalReadSchema,
Optional.empty()
);
final CloseableIterator<FilteredColumnarBatch> dataIter = Scan.transformPhysicalData(
tableClient,
engine,
scanState,
scanFile,
physicalDataIter

View File

@ -23,19 +23,19 @@ import io.delta.kernel.Scan;
import io.delta.kernel.ScanBuilder;
import io.delta.kernel.Snapshot;
import io.delta.kernel.Table;
import io.delta.kernel.TableNotFoundException;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.exceptions.TableNotFoundException;
import io.delta.kernel.types.StructType;
public class DeltaTestUtils
{
public static Scan getScan(final TableClient tableClient, final String deltaTablePath) throws TableNotFoundException
public static Scan getScan(final Engine engine, final String deltaTablePath) throws TableNotFoundException
{
final Table table = Table.forPath(tableClient, deltaTablePath);
final Snapshot snapshot = table.getLatestSnapshot(tableClient);
final StructType readSchema = snapshot.getSchema(tableClient);
final ScanBuilder scanBuilder = snapshot.getScanBuilder(tableClient)
.withReadSchema(tableClient, readSchema);
final Table table = Table.forPath(engine, deltaTablePath);
final Snapshot snapshot = table.getLatestSnapshot(engine);
final StructType readSchema = snapshot.getSchema(engine);
final ScanBuilder scanBuilder = snapshot.getScanBuilder(engine)
.withReadSchema(engine, readSchema);
return scanBuilder.build();
}
}

View File

@ -20,11 +20,11 @@
package org.apache.druid.delta.input;
import io.delta.kernel.Scan;
import io.delta.kernel.TableNotFoundException;
import io.delta.kernel.data.Row;
import io.delta.kernel.defaults.client.DefaultTableClient;
import io.delta.kernel.defaults.engine.DefaultEngine;
import io.delta.kernel.exceptions.TableNotFoundException;
import org.apache.hadoop.conf.Configuration;
import org.junit.Assert;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
@ -46,13 +46,13 @@ public class RowSerdeTest
@ParameterizedTest(name = "{index}:with context {0}")
public void testSerializeDeserializeRoundtrip(final String tablePath) throws TableNotFoundException
{
final DefaultTableClient tableClient = DefaultTableClient.create(new Configuration());
final Scan scan = DeltaTestUtils.getScan(tableClient, tablePath);
final Row scanState = scan.getScanState(tableClient);
final DefaultEngine engine = DefaultEngine.create(new Configuration());
final Scan scan = DeltaTestUtils.getScan(engine, tablePath);
final Row scanState = scan.getScanState(engine);
final String rowJson = RowSerde.serializeRowToJson(scanState);
final Row row = RowSerde.deserializeRowFromJson(tableClient, rowJson);
final Row row = RowSerde.deserializeRowFromJson(engine, rowJson);
Assert.assertEquals(scanState.getSchema(), row.getSchema());
Assertions.assertEquals(scanState.getSchema(), row.getSchema());
}
}