diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml index e04159e001..e851565994 100755 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml @@ -218,6 +218,7 @@ src/test/resources/syslog/syslog5424/log_mix.txt src/test/resources/syslog/syslog5424/log_mix_in_error.txt src/test/resources/text/testschema + src/test/resources/xml/field_with_sub-element.xml src/test/resources/xml/people.xml src/test/resources/xml/people2.xml src/test/resources/xml/people3.xml @@ -236,6 +237,7 @@ src/test/resources/xml/people_tag_in_characters.xml src/test/resources/xml/people_with_header_and_comments.xml src/test/resources/xml/person.xml + src/test/resources/xml/person_record.xml src/test/resources/xml/testschema src/test/resources/xml/testschema2 src/test/resources/xml/testschema3 diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java index 52a9701507..20ee5195c6 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java @@ -97,7 +97,10 @@ public class XMLReader extends SchemaRegistryService 
implements RecordReaderFact .description("If tags with content (e. g. content) are defined as nested records in the schema, " + "the name of the tag will be used as name for the record and the value of this property will be used as name for the field. " + "If tags with content shall be parsed together with attributes (e. g. content), " + - "they have to be defined as records. For additional information, see the section of processor usage.") + "they have to be defined as records. In such a case, the name of the tag will be used as the name for the record and " + + "the value of this property will be used as the name for the field holding the original content. The name of the attribute " + + "will be used to create a new record field, the content of which will be the value of the attribute. " + + "For more information, see the 'Additional Details...' section of the XMLReader controller service's documentation.") .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES) .required(false) @@ -136,7 +139,12 @@ public class XMLReader extends SchemaRegistryService implements RecordReaderFact @Override protected SchemaAccessStrategy getSchemaAccessStrategy(final String strategy, final SchemaRegistry schemaRegistry, final PropertyContext context) { - final RecordSourceFactory sourceFactory = (variables, contentStream) -> new XmlRecordSource(contentStream, isMultipleRecords(context, variables)); + + final RecordSourceFactory sourceFactory = (variables, contentStream) -> { + String contentFieldName = trim(context.getProperty(CONTENT_FIELD_NAME).evaluateAttributeExpressions(variables).getValue()); + contentFieldName = (contentFieldName == null) ? 
"value" : contentFieldName; + return new XmlRecordSource(contentStream, contentFieldName, isMultipleRecords(context, variables)); + }; final Supplier> schemaInference = () -> new XmlSchemaInference(new TimeValueInference(dateFormat, timeFormat, timestampFormat)); return SchemaInferenceUtil.getSchemaAccessStrategy(strategy, context, getLogger(), sourceFactory, schemaInference, diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java index 2cb165b6e9..fd6d23b3e2 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java @@ -339,8 +339,8 @@ public class XMLRecordReader implements RecordReader { if (contentFieldName != null) { recordValues.put(contentFieldName, content.toString()); } else { - logger.debug("Found content for field that has to be parsed as record but property \"Field Name for Content\" is not set. " + - "The content will not be added to the record."); + logger.debug("Found content for a field that was supposed to be named with the value of the \"Field Name for Content\" property but " + + "the property was not set. 
The content was not added to the record."); } return new MapRecord(new SimpleRecordSchema(Collections.emptyList()), recordValues); @@ -486,10 +486,13 @@ public class XMLRecordReader implements RecordReader { if (field.isPresent()) { Object value = parseStringForType(content.toString(), contentFieldName, field.get().getDataType()); recordValues.put(contentFieldName, value); + } else { + logger.debug("Found content for a field that was supposed to be named with the value of the \"Field Name for Content\" property " + + "but no such field was present in the schema. The content was not added to the record."); } } else { - logger.debug("Found content for field that is defined as record but property \"Field Name for Content\" is not set. " + - "The content will not be added to record."); + logger.debug("Found content for a field that was supposed to be named with the value of the \"Field Name for Content\" property but " + + "the property was not set. The content was not added to the record."); } } diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java index 3192e141aa..8352aed08d 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java @@ -35,8 +35,10 @@ import java.util.Map; public class XmlRecordSource implements RecordSource { private final XMLEventReader xmlEventReader; + private final String contentFieldName; - public 
XmlRecordSource(final InputStream in, final boolean ignoreWrapper) throws IOException { + public XmlRecordSource(final InputStream in, final String contentFieldName, final boolean ignoreWrapper) throws IOException { + this.contentFieldName = contentFieldName; try { final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); @@ -125,7 +127,7 @@ public class XmlRecordSource implements RecordSource { } else { final String textContent = content.toString().trim(); if (!textContent.equals("")) { - childNodes.put("value", new XmlTextNode("value", textContent)); + childNodes.put(contentFieldName, new XmlTextNode(contentFieldName, textContent)); } return new XmlContainerNode(nodeName, childNodes); diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html index b8e1dff0d8..adcb3cc550 100755 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html @@ -286,6 +286,357 @@ for tags containing attributes and content.

+

Example: Tags with Attributes and Schema Inference

+ +

+ When the record's schema is not provided but inferred based on the data itself, providing a value for the "Field Name for Content" property + is especially important. (For detailed information on schema inference, see the "Schema Inference" section below.) + Let's focus on cases where an XML element (called <field_with_attribute> in the examples) has an XML attribute and some content and no sub-elements. + For the examples below, let's assume that a ConvertRecord processor is used, and it uses an XMLReader controller service and an XMLRecordSetWriter + controller service. The settings for XMLReader are provided separately for each example. The settings for XMLRecordSetWriter are common + for all the examples below. This way an XML to XML conversion is executed and comparing the input data with the output highlights + the schema inference behavior. The same behavior can be observed if a different Writer controller service is used. + XMLRecordSetWriter was chosen for these examples so that the input and the output are easily comparable. + The settings of the common XMLRecordSetWriter are the following: +

+ + + + + + + + + + + + + + +
Property NameProperty Value
Schema Access StrategyInherit Record Schema
Suppress Null ValuesNever Suppress
+ +

XML Attributes and Schema Inference Example 1

+ +

+ XMLReader settings: +

+ + + + + + + + + + + + + + + + + + +
Property NameProperty Value
Schema Access StrategyInfer Schema
Expect Records as Arrayfalse
Field Name for Contentnot set
+ +

+ Input: +

+ + +
+                <record>
+                    <field_with_attribute attr="attr_content">
+                        content of field
+                    </field_with_attribute>
+                </record>
+
+ +

As mentioned above, the element called "field_with_attribute" has an attribute and some content but no sub-element.

+ +

+ Output: +

+ + +
+                <record>
+                    <field_with_attribute>
+                        <attr>attr_content</attr>
+                        <value></value>
+                    </field_with_attribute>
+                </record>
+
+ +

+ In the XMLReader's settings, no value is set for the "Field Name for Content" property. In such cases the schema inference logic + adds a field named "value" to the schema. However, since "Field Name for Content" is not set, the data processing logic is instructed + not to consider the original content of the parent XML tag (<field_with_attribute>, the content of which is "content of field" + in the example). So a new field named "value" appears in the schema but no value is assigned to it from the data, thus the field is empty. + The XML attribute (named "attr") is processed, a field named "attr" is added to the schema and the attribute's value ("attr_content") is assigned to it. + In a case like this, the parent field's original content is lost and a new field named "value" appears in the schema with no data assigned to it. + This is to make sure that no data is overwritten in the record if it already contains a field named "value". More on that case in Example 3 and Example 4. +

+ +

XML Attributes and Schema Inference Example 2

+ +

+ In this example, the XMLReader's "Field Name for Content" property is filled with the value "original_content". The input data is the same as + in the previous example. +

+ +

+ XMLReader settings: +

+ + + + + + + + + + + + + + + + + + +
Property NameProperty Value
Schema Access StrategyInfer Schema
Expect Records as Arrayfalse
Field Name for Contentoriginal_content
+ +

+ Input: +

+ + +
+                <record>
+                    <field_with_attribute attr="attr_content">
+                        content of field
+                    </field_with_attribute>
+                </record>
+
+ +

+ Output: +

+ + +
+                <record>
+                    <field_with_attribute>
+                        <attr>attr_content</attr>
+                        <original_content>content of field</original_content>
+                    </field_with_attribute>
+                </record>
+
+ +

+ The XMLReader's "Field Name for Content" property contains the value "original_content" (the concrete value is not important; what is important + is that a value is provided and it does not clash with the name of any sub-element in <field_with_attribute>). + This explicitly tells the XMLReader controller service to create a field named "original_content" and make the original content of + the parent XML tag the value of the field named "original_content". Adding the XML attribute named "attr" works just like in the first example. + Since the <field_with_attribute> element had no sub-element with the name "original_content", no data is lost. +

+ +

XML Attributes and Schema Inference Example 3

+ +

+ In this example, XMLReader's "Field Name for Content" property is left empty. In the input data, the <field_with_attribute> element + has some content and a sub-element named <value>. +

+ +

+ XMLReader settings: +

+ + + + + + + + + + + + + + + + + + +
Property NameProperty Value
Schema Access StrategyInfer Schema
Expect Records as Arrayfalse
Field Name for Contentnot set
+ +

+ Input: +

+ + +
+                <record>
+                    <field_with_attribute attr="attr_content">
+                          content of field<value>123</value>
+                    </field_with_attribute>
+                </record>
+
+ +

+ Output: +

+ + +
+                <record>
+                    <field_with_attribute>
+                        <attr>attr_content</attr>
+                        <value>123</value>
+                    </field_with_attribute>
+                </record>
+
+ +

+ The "Field Name for Content" property is not set, and the XML element has a sub-element named "value". The name of the sub-element clashes with the + default field name added to the schema by the Schema Inference logic (see Example 1). As seen in the output data, the input XML attribute's value + is added to the record just like in the previous examples. The value of the <value> element is retained, but the content of the + <field_with_attribute> element that was outside of the sub-element is lost. +

+ +

XML Attributes and Schema Inference Example 4

+ +

+ In this example, XMLReader's "Field Name for Content" property is given the value "value". In the input data, the <field_with_attribute> element + has some content and a sub-element named <value>. The name of the sub-element clashes with the value of the "Field Name for Content" property. +

+ +

+ XMLReader settings: +

+ + + + + + + + + + + + + + + + + + +
Property NameProperty Value
Schema Access StrategyInfer Schema
Expect Records as Arrayfalse
Field Name for Contentvalue
+ +

+ Input: +

+ + +
+                <record>
+                    <field_with_attribute attr="attr_content">
+                          content of field<value>123</value>
+                    </field_with_attribute>
+                </record>
+
+ +

+ Output: +

+ + +
+                <record>
+                    <field_with_attribute>
+                        <attr>attr_content</attr>
+                        <value>content of field</value>
+                    </field_with_attribute>
+                </record>
+
+ +

+ The "Field Name for Content" property's value is "value", and the XML element has a sub-element named "value". The name of the sub-element clashes with the + value of the "Field Name for Content" property. The value of the <value> element is replaced by the content of the + <field_with_attribute> element, and the original content of the <value> element is lost. +

+ +

XML Attributes and Schema Inference Example 5

+ +

+ To avoid losing any data, the XMLReader's "Field Name for Content" property needs to be given a value that does not clash with any sub-element's name + in the input data. In this example the input data is the same as in the previous one, but the "Field Name for Content" property's value is "original_content", + a value that does not clash with any sub-element name. No data is lost in this case. +

+ +

+ XMLReader settings: +

+ + + + + + + + + + + + + + + + + + +
Property NameProperty Value
Schema Access StrategyInfer Schema
Expect Records as Arrayfalse
Field Name for Contentoriginal_content
+ +

+ Input: +

+ + +
+                <record>
+                    <field_with_attribute attr="attr_content">
+                          content of field<value>123</value>
+                    </field_with_attribute>
+                </record>
+
+ +

+ Output: +

+ + +
+                <record>
+                    <field_with_attribute>
+                        <attr>attr_content</attr>
+                        <value>123</value>
+                        <original_content>content of field</original_content>
+                    </field_with_attribute>
+                </record>
+
+ +

+ It can be seen in the output data that the attribute has been added to the <field_with_attribute> element as a sub-element, + the <value> element retained its value, and the original content of the <field_with_attribute> element has been added as a sub-element + named "original_content". This is because a value was chosen for the "Field Name for Content" property that does not clash with any of + the existing sub-elements of the input XML element (<field_with_attribute>). No data is lost. +

+

Example: Array of records

diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java index b5bdd76025..2a4cd14f10 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java @@ -93,7 +93,8 @@ public class TestInferXmlSchema { @Test public void testStringFieldWithAttributes() throws IOException { - final RecordSchema schema = inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", true); + final String contentFieldName = "contentfield"; + final RecordSchema schema = inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", contentFieldName, true); assertEquals(3, schema.getFieldCount()); @@ -106,12 +107,16 @@ public class TestInferXmlSchema { final RecordSchema childSchema = ((RecordDataType) softwareDataType).getChildSchema(); assertSame(RecordFieldType.BOOLEAN, childSchema.getDataType("favorite").get().getFieldType()); - assertSame(RecordFieldType.STRING, childSchema.getDataType("value").get().getFieldType()); + assertSame(RecordFieldType.STRING, childSchema.getDataType(contentFieldName).get().getFieldType()); } private RecordSchema inferSchema(final String filename, final boolean ignoreWrapper) throws IOException { + return inferSchema(filename, "contentfield", ignoreWrapper); + } + + private RecordSchema inferSchema(final String filename, final String contentFieldName, final boolean ignoreWrapper) throws IOException { final File file = new File(filename); - final 
RecordSourceFactory xmlSourceFactory = (var, in) -> new XmlRecordSource(in, ignoreWrapper); + final RecordSourceFactory xmlSourceFactory = (var, in) -> new XmlRecordSource(in, contentFieldName, ignoreWrapper); final SchemaInferenceEngine schemaInference = new XmlSchemaInference(timeValueInference); final InferSchemaAccessStrategy inferStrategy = new InferSchemaAccessStrategy<>(xmlSourceFactory, schemaInference, Mockito.mock(ComponentLog.class)); diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java index 5de7eac402..6904a6f599 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java @@ -17,8 +17,10 @@ package org.apache.nifi.xml; +import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.reporting.InitializationException; import org.apache.nifi.schema.access.SchemaAccessUtils; +import org.apache.nifi.schema.inference.SchemaInferenceUtil; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; import org.apache.nifi.util.TestRunners; @@ -31,43 +33,48 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import static junit.framework.TestCase.assertEquals; public class TestXMLReader { - private XMLReader reader; - private final String ATTRIBUTE_PREFIX = "attribute_prefix"; private final String 
CONTENT_NAME = "content_field"; private final String EVALUATE_IS_ARRAY = "xml.stream.is.array"; - public TestRunner setup(String filePath) throws InitializationException, IOException { - + private TestRunner setup(Map xmlReaderProperties) throws InitializationException { TestRunner runner = TestRunners.newTestRunner(TestXMLReaderProcessor.class); - reader = new XMLReader(); + XMLReader reader = new XMLReader(); + runner.addControllerService("xml_reader", reader); runner.setProperty(TestXMLReaderProcessor.XML_READER, "xml_reader"); - final String outputSchemaText = new String(Files.readAllBytes(Paths.get(filePath))); - runner.setProperty(reader, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY); - runner.setProperty(reader, SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); + for (Map.Entry entry : xmlReaderProperties.entrySet()) { + runner.setProperty(reader, entry.getKey(), entry.getValue()); + } + runner.enableControllerService(reader); return runner; } @Test - public void testRecordFormat() throws IOException, InitializationException { - TestRunner runner = setup("src/test/resources/xml/testschema"); + public void testRecordFormatDeterminedBasedOnAttribute() throws IOException, InitializationException { + String outputSchemaPath = "src/test/resources/xml/testschema"; + String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath))); - runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_EVALUATE); + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue()); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_EVALUATE.getValue()); + TestRunner runner = setup(xmlReaderProperties); - runner.enableControllerService(reader); - - InputStream is = new FileInputStream("src/test/resources/xml/people.xml"); - 
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); - runner.run(); + try (InputStream is = new FileInputStream("src/test/resources/xml/people.xml")) { + runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); + runner.run(); + } List flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List records = Arrays.asList((new String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n")); @@ -76,16 +83,20 @@ public class TestXMLReader { } @Test - public void testRecordFormat2() throws IOException, InitializationException { - TestRunner runner = setup("src/test/resources/xml/testschema"); + public void testRecordFormatArray() throws IOException, InitializationException { + String outputSchemaPath = "src/test/resources/xml/testschema"; + String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath))); - runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY); + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue()); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY.getValue()); + TestRunner runner = setup(xmlReaderProperties); - runner.enableControllerService(reader); - - InputStream is = new FileInputStream("src/test/resources/xml/people.xml"); - runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); - runner.run(); + try (InputStream is = new FileInputStream("src/test/resources/xml/people.xml")) { + runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); + runner.run(); + } List flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List records = Arrays.asList((new String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n")); @@ -94,16 +105,20 @@ public class TestXMLReader { } @Test - public 
void testRecordFormat3() throws IOException, InitializationException { - TestRunner runner = setup("src/test/resources/xml/testschema"); + public void testRecordFormatNotArray() throws IOException, InitializationException { + String outputSchemaPath = "src/test/resources/xml/testschema"; + String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath))); - runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE); + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue()); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue()); + TestRunner runner = setup(xmlReaderProperties); - runner.enableControllerService(reader); - - InputStream is = new FileInputStream("src/test/resources/xml/person.xml"); - runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); - runner.run(); + try (InputStream is = new FileInputStream("src/test/resources/xml/person.xml")) { + runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); + runner.run(); + } List flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n")); @@ -113,16 +128,20 @@ public class TestXMLReader { @Test public void testAttributePrefix() throws IOException, InitializationException { - TestRunner runner = setup("src/test/resources/xml/testschema"); + String outputSchemaPath = "src/test/resources/xml/testschema"; + String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath))); - runner.setProperty(reader, XMLReader.ATTRIBUTE_PREFIX, "${" + ATTRIBUTE_PREFIX + "}"); - runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY); + Map xmlReaderProperties = new HashMap<>(); + 
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue()); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); + xmlReaderProperties.put(XMLReader.ATTRIBUTE_PREFIX, "${" + ATTRIBUTE_PREFIX + "}"); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY.getValue()); + TestRunner runner = setup(xmlReaderProperties); - runner.enableControllerService(reader); - - InputStream is = new FileInputStream("src/test/resources/xml/people.xml"); - runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX, "ATTR_")); - runner.run(); + try (InputStream is = new FileInputStream("src/test/resources/xml/people.xml")) { + runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX, "ATTR_")); + runner.run(); + } List flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n")); @@ -136,16 +155,20 @@ public class TestXMLReader { @Test public void testContentField() throws IOException, InitializationException { - TestRunner runner = setup("src/test/resources/xml/testschema2"); + String outputSchemaPath = "src/test/resources/xml/testschema2"; + String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath))); - runner.setProperty(reader, XMLReader.CONTENT_FIELD_NAME, "${" + CONTENT_NAME + "}"); - runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY); + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue()); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); + xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "${" + CONTENT_NAME + "}"); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY.getValue()); + TestRunner runner = setup(xmlReaderProperties); - 
runner.enableControllerService(reader); - - InputStream is = new FileInputStream("src/test/resources/xml/people_tag_in_characters.xml"); - runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, "CONTENT")); - runner.run(); + try (InputStream is = new FileInputStream("src/test/resources/xml/people_tag_in_characters.xml")) { + runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, "CONTENT")); + runner.run(); + } List flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n")); @@ -157,4 +180,103 @@ public class TestXMLReader { assertEquals("MapRecord[{ID=P4, NAME=MapRecord[{CONTENT=Elenora Scrivens, ATTR=attr content, INNER=inner content}], AGE=16}]", records.get(3)); assertEquals("MapRecord[{ID=P5, NAME=MapRecord[{INNER=inner content}]}]", records.get(4)); } + + @Test + public void testInferSchema() throws InitializationException, IOException { + String expectedContent = "MapRecord[{software=MapRecord[{" + CONTENT_NAME + "=Apache NiFi, favorite=true}], num=123, name=John Doe}]"; + + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue()); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue()); + xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME); + TestRunner runner = setup(xmlReaderProperties); + + try (InputStream is = new FileInputStream("src/test/resources/xml/person_record.xml")) { + runner.enqueue(is); + runner.run(); + } + + MockFlowFile out = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0); + String actualContent = out.getContent(); + assertEquals(expectedContent, actualContent); + } + + @Test + public void testInferSchemaContentFieldNameNotSet() throws InitializationException, IOException { + String expectedContent = 
"MapRecord[{software=MapRecord[{favorite=true}], num=123, name=John Doe}]"; + + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue()); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue()); + TestRunner runner = setup(xmlReaderProperties); + + try (InputStream is = new FileInputStream("src/test/resources/xml/person_record.xml")) { + runner.enqueue(is); + runner.run(); + } + + MockFlowFile out = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0); + String actualContent = out.getContent(); + assertEquals(expectedContent, actualContent); + } + + @Test + public void testInferSchemaContentFieldNameNotSetSubElementExists() throws InitializationException, IOException { + String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=123}]}]"; + + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue()); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue()); + TestRunner runner = setup(xmlReaderProperties); + + try (InputStream is = new FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) { + runner.enqueue(is); + runner.run(); + } + + MockFlowFile out = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0); + String actualContent = out.getContent(); + assertEquals(expectedContent, actualContent); + } + + @Test + public void testInferSchemaContentFieldNameSetSubElementExistsNameClash() throws InitializationException, IOException { + String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=content of field}]}]"; + + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue()); + 
xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue()); + xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "value"); + TestRunner runner = setup(xmlReaderProperties); + + try (InputStream is = new FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) { + runner.enqueue(is); + runner.run(); + } + + MockFlowFile out = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0); + String actualContent = out.getContent(); + assertEquals(expectedContent, actualContent); + } + + @Test + public void testInferSchemaContentFieldNameSetSubElementExistsNoNameClash() throws InitializationException, IOException { + String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{" +CONTENT_NAME + "=content of field, " + + "attr=attr_content, value=123}]}]"; + + Map xmlReaderProperties = new HashMap<>(); + xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue()); + xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue()); + xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME); + TestRunner runner = setup(xmlReaderProperties); + + try (InputStream is = new FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) { + runner.enqueue(is); + runner.run(); + } + + MockFlowFile out = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0); + String actualContent = out.getContent(); + assertEquals(expectedContent, actualContent); + } } diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml new file mode 100644 index 0000000000..2c9146119d --- /dev/null +++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml @@ -0,0 +1,4 @@ + + content of field123 + + \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml new file mode 100644 index 0000000000..08b39093e0 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml @@ -0,0 +1,5 @@ + + 123 + John Doe + Apache NiFi + \ No newline at end of file