NIFI-9832: Fix disappearing XML element content when the element has attribute (#5896)

- NIFI-9832: Additional test cases for XMLReader
This commit is contained in:
Peter Gyori 2022-04-12 17:41:50 +02:00 committed by GitHub
parent 27e78c6f0c
commit 68c6722f76
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 561 additions and 59 deletions

View File

@ -218,6 +218,7 @@
<exclude>src/test/resources/syslog/syslog5424/log_mix.txt</exclude> <exclude>src/test/resources/syslog/syslog5424/log_mix.txt</exclude>
<exclude>src/test/resources/syslog/syslog5424/log_mix_in_error.txt</exclude> <exclude>src/test/resources/syslog/syslog5424/log_mix_in_error.txt</exclude>
<exclude>src/test/resources/text/testschema</exclude> <exclude>src/test/resources/text/testschema</exclude>
<exclude>src/test/resources/xml/field_with_sub-element.xml</exclude>
<exclude>src/test/resources/xml/people.xml</exclude> <exclude>src/test/resources/xml/people.xml</exclude>
<exclude>src/test/resources/xml/people2.xml</exclude> <exclude>src/test/resources/xml/people2.xml</exclude>
<exclude>src/test/resources/xml/people3.xml</exclude> <exclude>src/test/resources/xml/people3.xml</exclude>
@ -236,6 +237,7 @@
<exclude>src/test/resources/xml/people_tag_in_characters.xml</exclude> <exclude>src/test/resources/xml/people_tag_in_characters.xml</exclude>
<exclude>src/test/resources/xml/people_with_header_and_comments.xml</exclude> <exclude>src/test/resources/xml/people_with_header_and_comments.xml</exclude>
<exclude>src/test/resources/xml/person.xml</exclude> <exclude>src/test/resources/xml/person.xml</exclude>
<exclude>src/test/resources/xml/person_record.xml</exclude>
<exclude>src/test/resources/xml/testschema</exclude> <exclude>src/test/resources/xml/testschema</exclude>
<exclude>src/test/resources/xml/testschema2</exclude> <exclude>src/test/resources/xml/testschema2</exclude>
<exclude>src/test/resources/xml/testschema3</exclude> <exclude>src/test/resources/xml/testschema3</exclude>

View File

@ -97,7 +97,10 @@ public class XMLReader extends SchemaRegistryService implements RecordReaderFact
.description("If tags with content (e. g. <field>content</field>) are defined as nested records in the schema, " + .description("If tags with content (e. g. <field>content</field>) are defined as nested records in the schema, " +
"the name of the tag will be used as name for the record and the value of this property will be used as name for the field. " + "the name of the tag will be used as name for the record and the value of this property will be used as name for the field. " +
"If tags with content shall be parsed together with attributes (e. g. <field attribute=\"123\">content</field>), " + "If tags with content shall be parsed together with attributes (e. g. <field attribute=\"123\">content</field>), " +
"they have to be defined as records. For additional information, see the section of processor usage.") "they have to be defined as records. In such a case, the name of the tag will be used as the name for the record and " +
"the value of this property will be used as the name for the field holding the original content. The name of the attribute " +
"will be used to create a new record field, the content of which will be the value of the attribute. " +
"For more information, see the 'Additional Details...' section of the XMLReader controller service's documentation.")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES) .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.required(false) .required(false)
@ -136,7 +139,12 @@ public class XMLReader extends SchemaRegistryService implements RecordReaderFact
@Override @Override
protected SchemaAccessStrategy getSchemaAccessStrategy(final String strategy, final SchemaRegistry schemaRegistry, final PropertyContext context) { protected SchemaAccessStrategy getSchemaAccessStrategy(final String strategy, final SchemaRegistry schemaRegistry, final PropertyContext context) {
final RecordSourceFactory<XmlNode> sourceFactory = (variables, contentStream) -> new XmlRecordSource(contentStream, isMultipleRecords(context, variables));
final RecordSourceFactory<XmlNode> sourceFactory = (variables, contentStream) -> {
String contentFieldName = trim(context.getProperty(CONTENT_FIELD_NAME).evaluateAttributeExpressions(variables).getValue());
contentFieldName = (contentFieldName == null) ? "value" : contentFieldName;
return new XmlRecordSource(contentStream, contentFieldName, isMultipleRecords(context, variables));
};
final Supplier<SchemaInferenceEngine<XmlNode>> schemaInference = () -> new XmlSchemaInference(new TimeValueInference(dateFormat, timeFormat, timestampFormat)); final Supplier<SchemaInferenceEngine<XmlNode>> schemaInference = () -> new XmlSchemaInference(new TimeValueInference(dateFormat, timeFormat, timestampFormat));
return SchemaInferenceUtil.getSchemaAccessStrategy(strategy, context, getLogger(), sourceFactory, schemaInference, return SchemaInferenceUtil.getSchemaAccessStrategy(strategy, context, getLogger(), sourceFactory, schemaInference,

View File

@ -339,8 +339,8 @@ public class XMLRecordReader implements RecordReader {
if (contentFieldName != null) { if (contentFieldName != null) {
recordValues.put(contentFieldName, content.toString()); recordValues.put(contentFieldName, content.toString());
} else { } else {
logger.debug("Found content for field that has to be parsed as record but property \"Field Name for Content\" is not set. " + logger.debug("Found content for a field that was supposed to be named with the value of the \"Field Name for Content\" property but " +
"The content will not be added to the record."); "the property was not set. The content was not added to the record.");
} }
return new MapRecord(new SimpleRecordSchema(Collections.emptyList()), recordValues); return new MapRecord(new SimpleRecordSchema(Collections.emptyList()), recordValues);
@ -486,10 +486,13 @@ public class XMLRecordReader implements RecordReader {
if (field.isPresent()) { if (field.isPresent()) {
Object value = parseStringForType(content.toString(), contentFieldName, field.get().getDataType()); Object value = parseStringForType(content.toString(), contentFieldName, field.get().getDataType());
recordValues.put(contentFieldName, value); recordValues.put(contentFieldName, value);
} else {
logger.debug("Found content for a field that was supposed to be named with the value of the \"Field Name for Content\" property " +
"but no such field was present in the schema. The content was not added to the record.");
} }
} else { } else {
logger.debug("Found content for field that is defined as record but property \"Field Name for Content\" is not set. " + logger.debug("Found content for a field that was supposed to be named with the value of the \"Field Name for Content\" property but " +
"The content will not be added to record."); "the property was not set. The content was not added to the record.");
} }
} }

View File

@ -35,8 +35,10 @@ import java.util.Map;
public class XmlRecordSource implements RecordSource<XmlNode> { public class XmlRecordSource implements RecordSource<XmlNode> {
private final XMLEventReader xmlEventReader; private final XMLEventReader xmlEventReader;
private final String contentFieldName;
public XmlRecordSource(final InputStream in, final boolean ignoreWrapper) throws IOException { public XmlRecordSource(final InputStream in, final String contentFieldName, final boolean ignoreWrapper) throws IOException {
this.contentFieldName = contentFieldName;
try { try {
final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
@ -125,7 +127,7 @@ public class XmlRecordSource implements RecordSource<XmlNode> {
} else { } else {
final String textContent = content.toString().trim(); final String textContent = content.toString().trim();
if (!textContent.equals("")) { if (!textContent.equals("")) {
childNodes.put("value", new XmlTextNode("value", textContent)); childNodes.put(contentFieldName, new XmlTextNode(contentFieldName, textContent));
} }
return new XmlContainerNode(nodeName, childNodes); return new XmlContainerNode(nodeName, childNodes);

View File

@ -286,6 +286,357 @@
for tags containing attributes and content. for tags containing attributes and content.
</p> </p>
<h2>Example: Tags with Attributes and Schema Inference</h2>
<p>
When the record's schema is not provided but inferred based on the data itself, providing a value for the "Field Name for Content" property
is especially important. (For detailed information on schema inference, see the "Schema Inference" section below.)
Let's focus on cases where an XML element (called <code>&lt;field_with_attribute&gt;</code> in the examples) has an XML attribute and some content and no sub-elements.
For the examples below, let's assume that a ConvertRecord processor is used, and it uses an XMLReader controller service and an XMLRecordSetWriter
controller service. The settings for XMLReader are provided separately for each example. The settings for XMLRecordSetWriter are common
for all the examples below. This way an XML to XML conversion is executed and comparing the input data with the output highlights
the schema inference behavior. The same behavior can be observed if a different Writer controller service is used.
XMLRecordSetWriter was chosen for these examples so that the input and the output are easily comparable.
The settings of the common XMLRecordSetWriter are the following:
</p>
<table>
<tr>
<th>Property Name</th>
<th>Property Value</th>
</tr>
<tr>
<td>Schema Access Strategy</td>
<td><code>Inherit Record Schema</code></td>
</tr>
<tr>
<td>Suppress Null Values</td>
<td><code>Never Suppress</code></td>
</tr>
</table>
<h3>XML Attributes and Schema Inference Example 1</h3>
<p>
XMLReader settings:
</p>
<table>
<tr>
<th>Property Name</th>
<th>Property Value</th>
</tr>
<tr>
<td>Schema Access Strategy</td>
<td><code>Infer Schema</code></td>
</tr>
<tr>
<td>Expect Records as Array</td>
<td><code>false</code></td>
</tr>
<tr>
<td>Field Name for Content</td>
<td>not set</td>
</tr>
</table>
<p>
Input:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute attr="attr_content"&gt;
content of field
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>As mentioned above, the element called "field_with_attribute" has an attribute and some content but no sub-element.</p>
<p>
Output:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute&gt;
&lt;attr&gt;attr_content&lt;/attr&gt;
&lt;value&gt;&lt;/value&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
In the XMLReader's settings, no value is set for the "Field Name for Content" property. In such cases the schema inference logic
adds a field named "value" to the schema. However, since "Field Name for Content" is not set, the data processing logic is instructed
not to consider the original content of the parent XML tag (<code>&lt;field_with_attribute&gt;</code>, the content of which is "content of field"
in the example). So a new field named "value" appears in the schema but no value is assigned to it from the data, thus the field is empty.
The XML attribute (named "attr") is processed, a field named "attr" is added to the schema and the attribute's value ("attr_content") is assigned to it.
In a case like this, the parent field's original content is lost and a new field named "value" appears in the schema with no data assigned to it.
This is to make sure that no data is overwritten in the record if it already contains a field named "value". More on that case in Example 3 and Example 4.
</p>
<h3>XML Attributes and Schema Inference Example 2</h3>
<p>
In this example, the XMLReader's "Field Name for Content" property is filled with the value "original_content". The input data is the same as
in the previous example.
</p>
<p>
XMLReader settings:
</p>
<table>
<tr>
<th>Property Name</th>
<th>Property Value</th>
</tr>
<tr>
<td>Schema Access Strategy</td>
<td><code>Infer Schema</code></td>
</tr>
<tr>
<td>Expect Records as Array</td>
<td><code>false</code></td>
</tr>
<tr>
<td>Field Name for Content</td>
<td><code>original_content</code></td>
</tr>
</table>
<p>
Input:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute attr="attr_content"&gt;
content of field
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
Output:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute&gt;
&lt;attr&gt;attr_content&lt;/attr&gt;
&lt;original_content&gt;content of field&lt;/original_content&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
The XMLReader's "Field Name for Content" property contains the value "original_content" (the concrete value is not important; what is important
is that a value is provided and it does not clash with the name of any sub-element in <code>&lt;field_with_attribute&gt;</code>).
This explicitly tells the XMLReader controller service to create a field named "original_content" and make the original content of
the parent XML tag the value of the field named "original_content". Adding the XML attribute named "attr" works just like in the first example.
Since the <code>&lt;field_with_attribute&gt;</code> element had no child-element with the name "original_content", no data is lost.
</p>
<h3>XML Attributes and Schema Inference Example 3</h3>
<p>
In this example, XMLReader's "Field Name for Content" property is left empty. In the input data, the <code>&lt;field_with_attribute&gt;</code> element
has some content and a sub-element named <code>&lt;value&gt;</code>.
</p>
<p>
XMLReader settings:
</p>
<table>
<tr>
<th>Property Name</th>
<th>Property Value</th>
</tr>
<tr>
<td>Schema Access Strategy</td>
<td><code>Infer Schema</code></td>
</tr>
<tr>
<td>Expect Records as Array</td>
<td><code>false</code></td>
</tr>
<tr>
<td>Field Name for Content</td>
<td>not set</td>
</tr>
</table>
<p>
Input:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute attr="attr_content"&gt;
content of field&lt;value&gt;123&lt;/value&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
Output:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute&gt;
&lt;attr&gt;attr_content&lt;/attr&gt;
&lt;value&gt;123&lt;/value&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
The "Field Name for Content" property is not set, and the XML element has a sub-element named "value". The name of the sub-element clashes with the
default field name added to the schema by the Schema Inference logic (see Example 1). As seen in the output data, the input XML attribute's value
is added to the record just like in the previous examples. The value of the <code>&lt;value&gt;</code> element is retained, but the content of the
<code>&lt;field_with_attribute&gt;</code> element that was outside of the sub-element is lost.
</p>
<h3>XML Attributes and Schema Inference Example 4</h3>
<p>
In this example, XMLReader's "Field Name for Content" property is given the value "value". In the input data, the <code>&lt;field_with_attribute&gt;</code> element
has some content and a sub-element named <code>&lt;value&gt;</code>. The name of the sub-element clashes with the value of the "Field Name for Content" property.
</p>
<p>
XMLReader settings:
</p>
<table>
<tr>
<th>Property Name</th>
<th>Property Value</th>
</tr>
<tr>
<td>Schema Access Strategy</td>
<td><code>Infer Schema</code></td>
</tr>
<tr>
<td>Expect Records as Array</td>
<td><code>false</code></td>
</tr>
<tr>
<td>Field Name for Content</td>
<td><code>value</code></td>
</tr>
</table>
<p>
Input:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute attr="attr_content"&gt;
content of field&lt;value&gt;123&lt;/value&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
Output:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute&gt;
&lt;attr&gt;attr_content&lt;/attr&gt;
&lt;value&gt;content of field&lt;/value&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
The "Field Name for Content" property's value is "value", and the XML element has a sub-element named "value". The name of the sub-element clashes with the
value of the "Field Name for Content" property. The value of the <code>&lt;value&gt;</code> element is replaced by the content of the
<code>&lt;field_with_attribute&gt;</code> element, and the original content of the <code>&lt;value&gt;</code> element is lost.
</p>
<h3>XML Attributes and Schema Inference Example 5</h3>
<p>
To avoid losing any data, the XMLReader's "Field Name for Content" property needs to be given a value that does not clash with any sub-element's name
in the input data. In this example the input data is the same as in the previous one, but the "Field Name for Content" property's value is "original_content",
a value that does not clash with any sub-element name. No data is lost in this case.
</p>
<p>
XMLReader settings:
</p>
<table>
<tr>
<th>Property Name</th>
<th>Property Value</th>
</tr>
<tr>
<td>Schema Access Strategy</td>
<td><code>Infer Schema</code></td>
</tr>
<tr>
<td>Expect Records as Array</td>
<td><code>false</code></td>
</tr>
<tr>
<td>Field Name for Content</td>
<td><code>original_content</code></td>
</tr>
</table>
<p>
Input:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute attr="attr_content"&gt;
content of field&lt;value&gt;123&lt;/value&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
Output:
</p>
<code>
<pre>
&lt;record&gt;
&lt;field_with_attribute&gt;
&lt;attr&gt;attr_content&lt;/attr&gt;
&lt;value&gt;123&lt;/value&gt;
&lt;original_content&gt;content of field&lt;/original_content&gt;
&lt;/field_with_attribute&gt;
&lt;/record&gt;</pre>
</code>
<p>
It can be seen in the output data that the attribute has been added to the <code>&lt;field_with_attribute&gt;</code> element as a sub-element,
the <code>&lt;value&gt;</code> element retained its value, and the original content of the <code>&lt;field_with_attribute&gt;</code> element has been added as a sub-element
named "original_content". This is because a value was chosen for the "Field Name for Content" property that does not clash with any of
the existing sub-elements of the input XML element (<code>&lt;field_with_attribute&gt;</code>). No data is lost.
</p>
<h2>Example: Array of records</h2> <h2>Example: Array of records</h2>
<p> <p>

View File

@ -93,7 +93,8 @@ public class TestInferXmlSchema {
@Test @Test
public void testStringFieldWithAttributes() throws IOException { public void testStringFieldWithAttributes() throws IOException {
final RecordSchema schema = inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", true); final String contentFieldName = "contentfield";
final RecordSchema schema = inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", contentFieldName, true);
assertEquals(3, schema.getFieldCount()); assertEquals(3, schema.getFieldCount());
@ -106,12 +107,16 @@ public class TestInferXmlSchema {
final RecordSchema childSchema = ((RecordDataType) softwareDataType).getChildSchema(); final RecordSchema childSchema = ((RecordDataType) softwareDataType).getChildSchema();
assertSame(RecordFieldType.BOOLEAN, childSchema.getDataType("favorite").get().getFieldType()); assertSame(RecordFieldType.BOOLEAN, childSchema.getDataType("favorite").get().getFieldType());
assertSame(RecordFieldType.STRING, childSchema.getDataType("value").get().getFieldType()); assertSame(RecordFieldType.STRING, childSchema.getDataType(contentFieldName).get().getFieldType());
} }
private RecordSchema inferSchema(final String filename, final boolean ignoreWrapper) throws IOException { private RecordSchema inferSchema(final String filename, final boolean ignoreWrapper) throws IOException {
return inferSchema(filename, "contentfield", ignoreWrapper);
}
private RecordSchema inferSchema(final String filename, final String contentFieldName, final boolean ignoreWrapper) throws IOException {
final File file = new File(filename); final File file = new File(filename);
final RecordSourceFactory<XmlNode> xmlSourceFactory = (var, in) -> new XmlRecordSource(in, ignoreWrapper); final RecordSourceFactory<XmlNode> xmlSourceFactory = (var, in) -> new XmlRecordSource(in, contentFieldName, ignoreWrapper);
final SchemaInferenceEngine<XmlNode> schemaInference = new XmlSchemaInference(timeValueInference); final SchemaInferenceEngine<XmlNode> schemaInference = new XmlSchemaInference(timeValueInference);
final InferSchemaAccessStrategy<XmlNode> inferStrategy = new InferSchemaAccessStrategy<>(xmlSourceFactory, schemaInference, Mockito.mock(ComponentLog.class)); final InferSchemaAccessStrategy<XmlNode> inferStrategy = new InferSchemaAccessStrategy<>(xmlSourceFactory, schemaInference, Mockito.mock(ComponentLog.class));

View File

@ -17,8 +17,10 @@
package org.apache.nifi.xml; package org.apache.nifi.xml;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.reporting.InitializationException; import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.schema.access.SchemaAccessUtils; import org.apache.nifi.schema.access.SchemaAccessUtils;
import org.apache.nifi.schema.inference.SchemaInferenceUtil;
import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner; import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners; import org.apache.nifi.util.TestRunners;
@ -31,43 +33,48 @@ import java.nio.file.Files;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import static junit.framework.TestCase.assertEquals; import static junit.framework.TestCase.assertEquals;
public class TestXMLReader { public class TestXMLReader {
private XMLReader reader;
private final String ATTRIBUTE_PREFIX = "attribute_prefix"; private final String ATTRIBUTE_PREFIX = "attribute_prefix";
private final String CONTENT_NAME = "content_field"; private final String CONTENT_NAME = "content_field";
private final String EVALUATE_IS_ARRAY = "xml.stream.is.array"; private final String EVALUATE_IS_ARRAY = "xml.stream.is.array";
public TestRunner setup(String filePath) throws InitializationException, IOException { private TestRunner setup(Map<PropertyDescriptor, String> xmlReaderProperties) throws InitializationException {
TestRunner runner = TestRunners.newTestRunner(TestXMLReaderProcessor.class); TestRunner runner = TestRunners.newTestRunner(TestXMLReaderProcessor.class);
reader = new XMLReader(); XMLReader reader = new XMLReader();
runner.addControllerService("xml_reader", reader); runner.addControllerService("xml_reader", reader);
runner.setProperty(TestXMLReaderProcessor.XML_READER, "xml_reader"); runner.setProperty(TestXMLReaderProcessor.XML_READER, "xml_reader");
final String outputSchemaText = new String(Files.readAllBytes(Paths.get(filePath))); for (Map.Entry<PropertyDescriptor, String> entry : xmlReaderProperties.entrySet()) {
runner.setProperty(reader, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY); runner.setProperty(reader, entry.getKey(), entry.getValue());
runner.setProperty(reader, SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText); }
runner.enableControllerService(reader);
return runner; return runner;
} }
@Test @Test
public void testRecordFormat() throws IOException, InitializationException { public void testRecordFormatDeterminedBasedOnAttribute() throws IOException, InitializationException {
TestRunner runner = setup("src/test/resources/xml/testschema"); String outputSchemaPath = "src/test/resources/xml/testschema";
String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath)));
runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_EVALUATE); Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText);
xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_EVALUATE.getValue());
TestRunner runner = setup(xmlReaderProperties);
runner.enableControllerService(reader); try (InputStream is = new FileInputStream("src/test/resources/xml/people.xml")) {
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true"));
InputStream is = new FileInputStream("src/test/resources/xml/people.xml"); runner.run();
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); }
runner.run();
List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList((new String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n")); List<String> records = Arrays.asList((new String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n"));
@ -76,16 +83,20 @@ public class TestXMLReader {
} }
@Test @Test
public void testRecordFormat2() throws IOException, InitializationException { public void testRecordFormatArray() throws IOException, InitializationException {
TestRunner runner = setup("src/test/resources/xml/testschema"); String outputSchemaPath = "src/test/resources/xml/testschema";
String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath)));
runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY); Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText);
xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY.getValue());
TestRunner runner = setup(xmlReaderProperties);
runner.enableControllerService(reader); try (InputStream is = new FileInputStream("src/test/resources/xml/people.xml")) {
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true"));
InputStream is = new FileInputStream("src/test/resources/xml/people.xml"); runner.run();
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); }
runner.run();
List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList((new String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n")); List<String> records = Arrays.asList((new String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n"));
@ -94,16 +105,20 @@ public class TestXMLReader {
} }
@Test @Test
public void testRecordFormat3() throws IOException, InitializationException { public void testRecordFormatNotArray() throws IOException, InitializationException {
TestRunner runner = setup("src/test/resources/xml/testschema"); String outputSchemaPath = "src/test/resources/xml/testschema";
String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath)));
runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE); Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText);
xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue());
TestRunner runner = setup(xmlReaderProperties);
runner.enableControllerService(reader); try (InputStream is = new FileInputStream("src/test/resources/xml/person.xml")) {
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true"));
InputStream is = new FileInputStream("src/test/resources/xml/person.xml"); runner.run();
runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, "true")); }
runner.run();
List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n")); List<String> records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@ -113,16 +128,20 @@ public class TestXMLReader {
@Test @Test
public void testAttributePrefix() throws IOException, InitializationException { public void testAttributePrefix() throws IOException, InitializationException {
TestRunner runner = setup("src/test/resources/xml/testschema"); String outputSchemaPath = "src/test/resources/xml/testschema";
String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath)));
runner.setProperty(reader, XMLReader.ATTRIBUTE_PREFIX, "${" + ATTRIBUTE_PREFIX + "}"); Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY); xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText);
xmlReaderProperties.put(XMLReader.ATTRIBUTE_PREFIX, "${" + ATTRIBUTE_PREFIX + "}");
xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY.getValue());
TestRunner runner = setup(xmlReaderProperties);
runner.enableControllerService(reader); try (InputStream is = new FileInputStream("src/test/resources/xml/people.xml")) {
runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX, "ATTR_"));
InputStream is = new FileInputStream("src/test/resources/xml/people.xml"); runner.run();
runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX, "ATTR_")); }
runner.run();
List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n")); List<String> records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@ -136,16 +155,20 @@ public class TestXMLReader {
@Test @Test
public void testContentField() throws IOException, InitializationException { public void testContentField() throws IOException, InitializationException {
TestRunner runner = setup("src/test/resources/xml/testschema2"); String outputSchemaPath = "src/test/resources/xml/testschema2";
String outputSchemaText = new String(Files.readAllBytes(Paths.get(outputSchemaPath)));
runner.setProperty(reader, XMLReader.CONTENT_FIELD_NAME, "${" + CONTENT_NAME + "}"); Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
runner.setProperty(reader, XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY); xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, outputSchemaText);
xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "${" + CONTENT_NAME + "}");
xmlReaderProperties.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_ARRAY.getValue());
TestRunner runner = setup(xmlReaderProperties);
runner.enableControllerService(reader); try (InputStream is = new FileInputStream("src/test/resources/xml/people_tag_in_characters.xml")) {
runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, "CONTENT"));
InputStream is = new FileInputStream("src/test/resources/xml/people_tag_in_characters.xml"); runner.run();
runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, "CONTENT")); }
runner.run();
List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS); List<MockFlowFile> flowFile = runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n")); List<String> records = Arrays.asList(new String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@ -157,4 +180,103 @@ public class TestXMLReader {
assertEquals("MapRecord[{ID=P4, NAME=MapRecord[{CONTENT=Elenora Scrivens, ATTR=attr content, INNER=inner content}], AGE=16}]", records.get(3)); assertEquals("MapRecord[{ID=P4, NAME=MapRecord[{CONTENT=Elenora Scrivens, ATTR=attr content, INNER=inner content}], AGE=16}]", records.get(3));
assertEquals("MapRecord[{ID=P5, NAME=MapRecord[{INNER=inner content}]}]", records.get(4)); assertEquals("MapRecord[{ID=P5, NAME=MapRecord[{INNER=inner content}]}]", records.get(4));
} }
/**
 * Reads {@code person_record.xml} as a single record with an inferred schema while a content
 * field name is configured. The text of the {@code software} element, which also carries the
 * {@code favorite} attribute, is expected to show up under {@code CONTENT_NAME} next to that
 * attribute.
 */
@Test
public void testInferSchema() throws InitializationException, IOException {
    final Map<PropertyDescriptor, String> readerConfig = new HashMap<>();
    readerConfig.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue());
    readerConfig.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue());
    readerConfig.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME);
    final TestRunner testRunner = setup(readerConfig);

    try (InputStream personXml = new FileInputStream("src/test/resources/xml/person_record.xml")) {
        testRunner.enqueue(personXml);
        testRunner.run();
    }

    // Only the first flow file routed to SUCCESS is inspected.
    final String recordText = testRunner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0).getContent();
    final String expectedRecord = "MapRecord[{software=MapRecord[{" + CONTENT_NAME + "=Apache NiFi, favorite=true}], num=123, name=John Doe}]";
    assertEquals(expectedRecord, recordText);
}
/**
 * Reads {@code person_record.xml} as a single record with an inferred schema and no content
 * field name configured. The expected record keeps the {@code favorite} attribute of the
 * {@code software} element but contains no entry for that element's text.
 */
@Test
public void testInferSchemaContentFieldNameNotSet() throws InitializationException, IOException {
    final Map<PropertyDescriptor, String> readerConfig = new HashMap<>();
    readerConfig.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue());
    readerConfig.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue());
    final TestRunner testRunner = setup(readerConfig);

    try (InputStream personXml = new FileInputStream("src/test/resources/xml/person_record.xml")) {
        testRunner.enqueue(personXml);
        testRunner.run();
    }

    // Only the first flow file routed to SUCCESS is inspected.
    final String recordText = testRunner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0).getContent();
    final String expectedRecord = "MapRecord[{software=MapRecord[{favorite=true}], num=123, name=John Doe}]";
    assertEquals(expectedRecord, recordText);
}
/**
 * Reads {@code field_with_sub-element.xml} as a single record with an inferred schema and no
 * content field name configured. The element under test has an attribute, free text and a
 * {@code value} sub-element; the expected record holds the attribute and the sub-element only.
 */
@Test
public void testInferSchemaContentFieldNameNotSetSubElementExists() throws InitializationException, IOException {
    final Map<PropertyDescriptor, String> readerConfig = new HashMap<>();
    readerConfig.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue());
    readerConfig.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue());
    final TestRunner testRunner = setup(readerConfig);

    try (InputStream fieldXml = new FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
        testRunner.enqueue(fieldXml);
        testRunner.run();
    }

    // Only the first flow file routed to SUCCESS is inspected.
    final String recordText = testRunner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0).getContent();
    final String expectedRecord = "MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=123}]}]";
    assertEquals(expectedRecord, recordText);
}
/**
 * Reads {@code field_with_sub-element.xml} as a single record with an inferred schema while the
 * content field name is set to {@code "value"}, the same name as the element's sub-element.
 * The expected record has a single {@code value} entry holding the element's text rather than
 * the sub-element's {@code 123}.
 */
@Test
public void testInferSchemaContentFieldNameSetSubElementExistsNameClash() throws InitializationException, IOException {
    final Map<PropertyDescriptor, String> readerConfig = new HashMap<>();
    readerConfig.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue());
    readerConfig.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue());
    // Deliberately collides with the <value> sub-element of the fixture.
    readerConfig.put(XMLReader.CONTENT_FIELD_NAME, "value");
    final TestRunner testRunner = setup(readerConfig);

    try (InputStream fieldXml = new FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
        testRunner.enqueue(fieldXml);
        testRunner.run();
    }

    // Only the first flow file routed to SUCCESS is inspected.
    final String recordText = testRunner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0).getContent();
    final String expectedRecord = "MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=content of field}]}]";
    assertEquals(expectedRecord, recordText);
}
/**
 * Reads {@code field_with_sub-element.xml} as a single record with an inferred schema while the
 * content field name is {@code CONTENT_NAME}. The expected record lists the element's text
 * under {@code CONTENT_NAME} together with the attribute and the {@code value} sub-element.
 */
@Test
public void testInferSchemaContentFieldNameSetSubElementExistsNoNameClash() throws InitializationException, IOException {
    final Map<PropertyDescriptor, String> readerConfig = new HashMap<>();
    readerConfig.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaInferenceUtil.INFER_SCHEMA.getValue());
    readerConfig.put(XMLReader.RECORD_FORMAT, XMLReader.RECORD_SINGLE.getValue());
    readerConfig.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME);
    final TestRunner testRunner = setup(readerConfig);

    try (InputStream fieldXml = new FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
        testRunner.enqueue(fieldXml);
        testRunner.run();
    }

    // Only the first flow file routed to SUCCESS is inspected.
    final String recordText = testRunner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0).getContent();
    final String expectedRecord = "MapRecord[{field_with_attribute=MapRecord[{" + CONTENT_NAME + "=content of field, "
            + "attr=attr_content, value=123}]}]";
    assertEquals(expectedRecord, recordText);
}
} }

View File

@ -0,0 +1,4 @@
<record>
<field_with_attribute attr="attr_content">content of field<value>123</value>
</field_with_attribute>
</record>

View File

@ -0,0 +1,5 @@
<record>
<num>123</num>
<name>John Doe</name>
<software favorite="true">Apache NiFi</software>
</record>