NIFI-2593: This closes #891. Fixed handling of nested records/structs in ConvertAvroToORC

This commit is contained in:
Matt Burgess 2016-08-17 20:38:15 -04:00 committed by joewitt
parent 4847ed28a1
commit 6874a5d82d
3 changed files with 153 additions and 2 deletions

View File

@ -165,10 +165,26 @@ public class NiFiOrcUtils {
}); });
return mapWritable; return mapWritable;
} }
if (o instanceof GenericData.Record) {
GenericData.Record record = (GenericData.Record) o;
TypeInfo recordSchema = NiFiOrcUtils.getOrcField(record.getSchema());
List<Schema.Field> recordFields = record.getSchema().getFields();
if (recordFields != null) {
Object[] fieldObjects = new Object[recordFields.size()];
for (int i = 0; i < recordFields.size(); i++) {
Schema.Field field = recordFields.get(i);
Schema fieldSchema = field.schema();
Object fieldObject = record.get(field.name());
fieldObjects[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), fieldObject);
} }
return NiFiOrcUtils.createOrcStruct(recordSchema, fieldObjects);
}
}
throw new IllegalArgumentException("Error converting object of type " + o.getClass().getName() + " to ORC type " + typeInfo.getTypeName());
} else {
return null; return null;
} }
}
/** /**

View File

@ -16,13 +16,16 @@
*/ */
package org.apache.nifi.processors.hive; package org.apache.nifi.processors.hive;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter; import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter; import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
import org.apache.hadoop.hive.ql.io.orc.OrcFile; import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct; import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader; import org.apache.hadoop.hive.ql.io.orc.Reader;
@ -41,10 +44,14 @@ import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
@ -209,6 +216,96 @@ public class TestConvertAvroToORC {
assertNotNull(mapValue); assertNotNull(mapValue);
assertTrue(mapValue instanceof DoubleWritable); assertTrue(mapValue instanceof DoubleWritable);
assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE); assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
@Test
public void test_onTrigger_array_of_records() throws Exception {
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array_of_records.avsc"));
List<GenericRecord> innerRecords = new LinkedList<>();
final GenericRecord outerRecord = new GenericData.Record(schema);
Schema arraySchema = schema.getField("records").schema();
Schema innerRecordSchema = arraySchema.getElementType();
final GenericRecord innerRecord1 = new GenericData.Record(innerRecordSchema);
innerRecord1.put("name", "Joe");
innerRecord1.put("age", 42);
innerRecords.add(innerRecord1);
final GenericRecord innerRecord2 = new GenericData.Record(innerRecordSchema);
innerRecord2.put("name", "Mary");
innerRecord2.put("age", 28);
innerRecords.add(innerRecord2);
GenericData.Array<GenericRecord> array = new GenericData.Array<>(arraySchema, innerRecords);
outerRecord.put("records", array);
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
ByteArrayOutputStream out = new ByteArrayOutputStream();
try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, out);
dataFileWriter.append(outerRecord);
}
out.close();
// Build a flow file from the Avro record
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS org_apache_nifi_outer_record " +
"(records ARRAY<STRUCT<name:STRING, age:INT>>)"
+ " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
assertEquals("1", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
Object o = rows.next(null);
assertNotNull(o);
assertTrue(o instanceof OrcStruct);
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(schema));
// Verify the record contains an array
Object arrayFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("records"));
assertTrue(arrayFieldObject instanceof ArrayList);
ArrayList<?> arrayField = (ArrayList<?>) arrayFieldObject;
assertEquals(2, arrayField.size());
// Verify the first element. Should be a record with two fields "name" and "age"
Object element = arrayField.get(0);
assertTrue(element instanceof OrcStruct);
StructObjectInspector elementInspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(innerRecordSchema));
Object nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
assertTrue(nameObject instanceof Text);
assertEquals("Joe", nameObject.toString());
Object ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
assertTrue(ageObject instanceof IntWritable);
assertEquals(42, ((IntWritable) ageObject).get());
// Verify the first element. Should be a record with two fields "name" and "age"
element = arrayField.get(1);
assertTrue(element instanceof OrcStruct);
nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
assertTrue(nameObject instanceof Text);
assertEquals("Mary", nameObject.toString());
ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
assertTrue(ageObject instanceof IntWritable);
assertEquals(28, ((IntWritable) ageObject).get());
} }
} }

View File

@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace" : "org.apache.nifi",
"name" : "outer_record",
"type" : "record",
"fields" : [ {
"name" : "records",
"type" : {
"type" : "array",
"items" : {
"type" : "record",
"name" : "inner_record",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "age",
"type" : "int"
} ]
}
}
} ]
}