mirror of https://github.com/apache/nifi.git
NIFI-10576 Added onFinishRecordSet implementation for WriteParquetResult
This closes #6517

Co-authored-by: David Handermann <exceptionfactory@apache.org>
Signed-off-by: David Handermann <exceptionfactory@apache.org>
parent ac8e3dae58
commit 2fa82179a8
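For context: AbstractRecordSetWriter exposes an onFinishRecordSet() hook, and the attribute map it returns is surfaced through the WriteResult when a record set is finished. WriteParquetResult previously inherited the default empty-map behavior, so schema access attributes never reached the FlowFile. Below is a minimal, self-contained sketch of that callback contract with simplified signatures assumed for illustration; it is not the actual NiFi base class, which returns a WriteResult carrying the record count as well.

import java.util.Collections;
import java.util.Map;

// Simplified stand-in (assumption) for org.apache.nifi.serialization.AbstractRecordSetWriter.
abstract class RecordSetWriterSketch {

    // Default hook: contribute no attributes. WriteParquetResult previously
    // inherited this default, which is why its results carried no attributes.
    protected Map<String, String> onFinishRecordSet() {
        return Collections.emptyMap();
    }

    // Finishing the record set folds the hook's map into the result; NiFi
    // later copies these attributes onto the FlowFile.
    public final Map<String, String> finishRecordSet() {
        return onFinishRecordSet();
    }
}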
ParquetRecordSetWriter.java
@@ -107,7 +107,7 @@ public class ParquetRecordSetWriter extends SchemaRegistryRecordSetWriter implem
                 throw new SchemaNotFoundException("Failed to compile Avro Schema", e);
             }
 
-            return new WriteParquetResult(avroSchema, out, parquetConfig, logger);
+            return new WriteParquetResult(avroSchema, recordSchema, getSchemaAccessWriter(recordSchema, variables), out, parquetConfig, logger);
 
         } catch (final SchemaNotFoundException e) {
             throw new ProcessException("Could not determine the Avro Schema to use for writing the content", e);
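The factory change above threads two schema representations into the writer: the compiled Avro Schema drives Parquet encoding, while the NiFi RecordSchema, paired with the SchemaAccessWriter resolved from the configured write strategy, drives attribute generation. A hedged caller-side sketch, with all variables assumed to be in scope as they are inside ParquetRecordSetWriter's createWriter():

final RecordSetWriter writer = new WriteParquetResult(
        avroSchema,                                      // encodes records to Parquet
        recordSchema,                                    // feeds attribute generation
        getSchemaAccessWriter(recordSchema, variables),  // resolved write strategy
        out, parquetConfig, logger);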
WriteParquetResult.java
@@ -23,8 +23,10 @@ import org.apache.nifi.avro.AvroTypeUtil;
 import org.apache.nifi.logging.ComponentLog;
 import org.apache.nifi.parquet.stream.NifiParquetOutputFile;
 import org.apache.nifi.parquet.utils.ParquetConfig;
+import org.apache.nifi.schema.access.SchemaAccessWriter;
 import org.apache.nifi.serialization.AbstractRecordSetWriter;
 import org.apache.nifi.serialization.record.Record;
+import org.apache.nifi.serialization.record.RecordSchema;
 import org.apache.parquet.avro.AvroParquetWriter;
 import org.apache.parquet.hadoop.ParquetWriter;
 import org.apache.parquet.io.OutputFile;
@@ -41,17 +43,21 @@ public class WriteParquetResult extends AbstractRecordSetWriter {
     private final Schema schema;
     private final ParquetWriter<GenericRecord> parquetWriter;
     private final ComponentLog componentLogger;
+    private SchemaAccessWriter accessWriter;
+    private RecordSchema recordSchema;
 
-    public WriteParquetResult(final Schema schema, final OutputStream out, final ParquetConfig parquetConfig, final ComponentLog componentLogger) throws IOException {
+    public WriteParquetResult(final Schema avroSchema, final RecordSchema recordSchema, final SchemaAccessWriter accessWriter, final OutputStream out,
+                              final ParquetConfig parquetConfig, final ComponentLog componentLogger) throws IOException {
         super(out);
-        this.schema = schema;
+        this.schema = avroSchema;
         this.componentLogger = componentLogger;
+        this.accessWriter = accessWriter;
+        this.recordSchema = recordSchema;
 
         final Configuration conf = new Configuration();
         final OutputFile outputFile = new NifiParquetOutputFile(out);
 
-        final AvroParquetWriter.Builder<GenericRecord> writerBuilder =
-                AvroParquetWriter.<GenericRecord>builder(outputFile).withSchema(schema);
+        final AvroParquetWriter.Builder<GenericRecord> writerBuilder = AvroParquetWriter.<GenericRecord>builder(outputFile).withSchema(avroSchema);
         applyCommonConfig(writerBuilder, conf, parquetConfig);
         parquetWriter = writerBuilder.build();
     }
@@ -63,6 +69,11 @@ public class WriteParquetResult extends AbstractRecordSetWriter {
         return Collections.emptyMap();
     }
 
+    @Override
+    protected Map<String, String> onFinishRecordSet() {
+        return accessWriter.getAttributes(recordSchema);
+    }
+
     @Override
     public void close() throws IOException {
         try {
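With the override in place, finishing a record set surfaces whatever attributes the configured SchemaAccessWriter produces for the record schema. A usage sketch (variable names such as records are assumptions), mirroring the pattern the test below exercises:

recordSetWriter.beginRecordSet();
for (final Record record : records) {
    recordSetWriter.write(record);
}
final WriteResult writeResult = recordSetWriter.finishRecordSet();
// Non-empty once a schema write strategy such as "schema-name" is configured.
final Map<String, String> attributes = writeResult.getAttributes();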
TestParquetRecordSetWriter.java
@@ -31,6 +31,7 @@ import org.apache.nifi.schema.access.SchemaAccessUtils;
 import org.apache.nifi.schema.access.SchemaNotFoundException;
 import org.apache.nifi.serialization.RecordSetWriter;
 import org.apache.nifi.serialization.SimpleRecordSchema;
+import org.apache.nifi.serialization.WriteResult;
 import org.apache.nifi.serialization.record.MapRecord;
 import org.apache.nifi.serialization.record.Record;
 import org.apache.nifi.serialization.record.RecordSchema;
@@ -55,6 +56,7 @@ import java.util.HashMap;
 import java.util.Map;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 
 public class TestParquetRecordSetWriter {
 
@@ -73,16 +75,17 @@ public class TestParquetRecordSetWriter {
 
     @Test
     public void testWriteUsers() throws IOException, SchemaNotFoundException, InitializationException {
-        initRecordSetWriter();
+        initRecordSetWriter(true);
         final RecordSchema writeSchema = recordSetWriterFactory.getSchema(Collections.emptyMap(), null);
         final File parquetFile = new File("target/testWriterUsers-" + System.currentTimeMillis());
-        writeUsers(writeSchema, parquetFile);
+        final WriteResult writeResult = writeUsers(writeSchema, parquetFile);
+        assertWriteAttributesFound(writeResult);
         verifyParquetRecords(parquetFile);
     }
 
     @Test
     public void testWriteUsersWhenSchemaFormatNotAvro() throws IOException, SchemaNotFoundException, InitializationException {
-        initRecordSetWriter();
+        initRecordSetWriter(false);
         final RecordSchema writeSchema = recordSetWriterFactory.getSchema(Collections.emptyMap(), null);
         final RecordSchema writeSchemaWithOtherFormat = new SimpleRecordSchema(writeSchema.getFields(), null, "OTHER-FORMAT", SchemaIdentifier.EMPTY);
         final File parquetFile = new File("target/testWriterUsers-" + System.currentTimeMillis());
@@ -90,7 +93,7 @@
         verifyParquetRecords(parquetFile);
     }
 
-    private void initRecordSetWriter() throws IOException, InitializationException {
+    private void initRecordSetWriter(final boolean writeSchemaNameStrategy) throws IOException, InitializationException {
         final TestRunner runner = TestRunners.newTestRunner(new AbstractProcessor() {
             @Override
             public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
@@ -103,12 +106,18 @@
         final Map<PropertyDescriptor, String> properties = createPropertiesWithSchema(schemaFile);
         properties.forEach((k, v) -> runner.setProperty(recordSetWriterFactory, k, v));
+
+        if (writeSchemaNameStrategy) {
+            runner.setProperty(recordSetWriterFactory, "Schema Write Strategy", "schema-name");
+        }
+
         runner.enableControllerService(recordSetWriterFactory);
     }
 
-    private void writeUsers(final RecordSchema writeSchema, final File parquetFile) throws IOException {
+    private WriteResult writeUsers(final RecordSchema writeSchema, final File parquetFile) throws IOException {
+        final WriteResult writeResult;
         try(final OutputStream output = new FileOutputStream(parquetFile);
             final RecordSetWriter recordSetWriter = recordSetWriterFactory.createWriter(componentLog, writeSchema, output, Collections.emptyMap())) {
             recordSetWriter.beginRecordSet();
             for (int i = 0; i < USERS; i++) {
                 final Map<String, Object> userFields = new HashMap<>();
                 userFields.put("name", "user" + i);
@@ -120,7 +129,9 @@
             }
 
             recordSetWriter.flush();
+            writeResult = recordSetWriter.finishRecordSet();
         }
+        return writeResult;
     }
 
     private void verifyParquetRecords(final File parquetFile) throws IOException {
@@ -149,4 +160,9 @@
         propertyValues.put(SchemaAccessUtils.SCHEMA_TEXT, schemaText);
         return propertyValues;
     }
+
+    private void assertWriteAttributesFound(final WriteResult writeResult) {
+        final Map<String, String> attributes = writeResult.getAttributes();
+        assertFalse(attributes.isEmpty(), "Write Attributes not found");
+    }
 }
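A note on the test design: the new boolean parameter on initRecordSetWriter lets the same setup cover both paths, with the schema-name write strategy enabled (testWriteUsers, which now expects write attributes) and without it (testWriteUsersWhenSchemaFormatNotAvro). The assertion only requires a non-empty attribute map; the concrete keys, for instance a schema name entry, depend on the configured SchemaAccessWriter and are not pinned down by this commit.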