From a54f73922984a859c162f1ec303ce71e7634a3ee Mon Sep 17 00:00:00 2001 From: Peter Gyori Date: Tue, 16 Feb 2021 19:37:33 +0100 Subject: [PATCH] NIFI-7969: ValidateRecord enhanced with Force Types From Schema property NIFI-7969: Documentation update Clarified that the Force Types From Schema property applies to the data read, whereas the Strict Type Checking property applies to the validation. NIFI-7969: Documentation update - updated the property name in additionalDetails.html This closes #4825. Signed-off-by: Peter Turcsanyi --- .../processors/standard/ValidateRecord.java | 25 ++- .../additionalDetails.html | 190 ++++++++++++++++++ 2 files changed, 211 insertions(+), 4 deletions(-) create mode 100644 nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java index 1210eec7a1..a952acaae8 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java @@ -172,14 +172,29 @@ public class ValidateRecord extends AbstractProcessor { static final PropertyDescriptor STRICT_TYPE_CHECKING = new PropertyDescriptor.Builder() .name("strict-type-checking") .displayName("Strict Type Checking") - .description("If the incoming data has a Record where a field is not of the correct type, this property determine whether how to handle the Record. " - + "If true, the Record will still be considered invalid. 
If false, the Record will be considered valid and the field will be coerced into the " - + "correct type (if possible, according to the type coercion supported by the Record Writer).") + .description("If the incoming data has a Record where a field is not of the correct type, this property determines how to handle the Record. " + + "If true, the Record will be considered invalid. If false, the Record will be considered valid and the field will be coerced into the " + + "correct type (if possible, according to the type coercion supported by the Record Writer). " + + "This property controls how the data is validated against the validation schema.") .expressionLanguageSupported(ExpressionLanguageScope.NONE) .allowableValues("true", "false") .defaultValue("true") .required(true) .build(); + static final PropertyDescriptor COERCE_TYPES = new PropertyDescriptor.Builder() + .name("coerce-types") + .displayName("Force Types From Reader's Schema") + .description("If enabled, the processor will coerce every field to the type specified in the Reader's schema. " + + "If the value of a field cannot be coerced to the type, the field will be skipped (will not be read from the input data), " + + "thus will not appear in the output. " + + "If not enabled, then every field will appear in the output but their types may differ from what is " + + "specified in the schema. For details please see the Additional Details page of the processor's Help. 
" + + "This property controls how the data is read by the specified Record Reader.") + .expressionLanguageSupported(ExpressionLanguageScope.NONE) + .allowableValues("true", "false") + .defaultValue("false") + .required(true) + .build(); static final PropertyDescriptor VALIDATION_DETAILS_ATTRIBUTE_NAME = new PropertyDescriptor.Builder() .name("validation-details-attribute-name") .displayName("Validation Details Attribute Name") @@ -227,6 +242,7 @@ public class ValidateRecord extends AbstractProcessor { properties.add(SCHEMA_TEXT); properties.add(ALLOW_EXTRA_FIELDS); properties.add(STRICT_TYPE_CHECKING); + properties.add(COERCE_TYPES); properties.add(VALIDATION_DETAILS_ATTRIBUTE_NAME); properties.add(MAX_VALIDATION_DETAILS_LENGTH); return properties; @@ -282,6 +298,7 @@ public class ValidateRecord extends AbstractProcessor { final boolean allowExtraFields = context.getProperty(ALLOW_EXTRA_FIELDS).asBoolean(); final boolean strictTypeChecking = context.getProperty(STRICT_TYPE_CHECKING).asBoolean(); + final boolean coerceTypes = context.getProperty(COERCE_TYPES).asBoolean(); RecordSetWriter validWriter = null; RecordSetWriter invalidWriter = null; @@ -306,7 +323,7 @@ public class ValidateRecord extends AbstractProcessor { try { Record record; - while ((record = reader.nextRecord(false, false)) != null) { + while ((record = reader.nextRecord(coerceTypes, false)) != null) { final SchemaValidationResult result = validator.validate(record); recordCount++; diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html new file mode 100644 index 0000000000..c46443c2d6 --- /dev/null +++ 
b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html @@ -0,0 +1,190 @@ + + + + + + ValidateRecord + + + + + + + +

Examples of the effect of the Force Types From Reader's Schema property

+ +

+ The processor first reads the data from the incoming FlowFile using the specified Record Reader, + which uses a schema. Then, depending on the value of the Schema Access Strategy property, the processor + validates the data against either the reader's schema or a different schema. + After that, the processor writes the data into the outgoing FlowFile using the specified + Record Writer. If the data is valid, the validation schema is used by the writer. + If the data is invalid, the writer uses the reader's schema. + The Force Types From Reader's Schema property affects the first step: + how strictly the reader's schema should be applied when reading the data from the incoming FlowFile. + By affecting how the data is read, the value of the Force Types From Reader's Schema property also affects + the content of the ValidateRecord processor's output, + as well as whether the output is forwarded to the valid or the invalid relationship. + Below are two examples where the value of this property affects the output significantly. +

+ +

+ In both examples the input is in XML format and the output is in JSON. + In the examples we assume that the same schema is used for reading, validation and writing. +

+ +

Example 1

+ +

+ Schema: +

+
{
+    "namespace": "nifi",
+    "name": "test",
+    "type": "record",
+    "fields": [
+        { "name": "field1", "type": "string" },
+        { "name": "field2", "type": "string" }
+    ]
+}
+
+ +

+ Input: +

+
<test>
+    <field1>
+        <sub_field>content</sub_field>
+    </field1>
+    <field2>content_of_field_2</field2>
+</test>
+ +

+ Output if Force Types From Reader's Schema = true + (forwarded to the invalid relationship): +

+
[ {
+    "field2" : "content_of_field_2"
+} ]
+ +

+ Output if Force Types From Reader's Schema = false + (forwarded to the invalid relationship): +

+
[ {
+    "field1" : {
+        "sub_field" : "content"
+    },
+    "field2" : "content_of_field_2"
+} ]
+ +

+ As you can see, the FlowFile is forwarded to the invalid relationship in both cases, + since the input data does not match the provided Avro schema. + However, if Force Types From Reader's Schema = true, only the fields that comply with the schema + appear in the output. If Force Types From Reader's Schema = false, all fields appear + in the output regardless of whether they comply with the schema. +

+ +

Example 2

+ +

+ Schema: +

+
{
+    "namespace": "nifi",
+    "name": "test",
+    "type": "record",
+    "fields": [
+        {
+            "name": "field1",
+            "type": {
+                "type": "array",
+                "items": "string"
+            }
+        },
+        {
+            "name": "field2",
+            "type": {
+                "type": "array",
+                "items": "string"
+            }
+        }
+    ]
+}
+ +

+ Input: +

+
<test>
+    <field1>content_1</field1>
+    <field2>content_2</field2>
+    <field2>content_3</field2>
+</test>
+ +

+ Output if Force Types From Reader's Schema = true + (forwarded to the valid relationship): +

+
[ {
+    "field1" : [ "content_1" ],
+    "field2" : [ "content_2", "content_3" ]
+} ]
+ +

+ Output if Force Types From Reader's Schema = false + (forwarded to the invalid relationship): +

+
[ {
+    "field1" : "content_1",
+    "field2" : [ "content_2", "content_3" ]
+} ]
+ +

+ The schema expects two fields (field1 and field2), both of type ARRAY. + field1 appears only once in the input XML document. If Force Types From Reader's Schema = true, + the processor coerces this field to the type required by the schema, + so it is put in an array with one element. Since this type coercion can be done, + the output is routed to the valid relationship. + If Force Types From Reader's Schema = false, the processor does not try to apply + type coercion, thus field1 appears in the output as a single value. According to the schema, + the processor expects an array for field1 but receives a single element, so the output + is routed to the invalid relationship. +

+

+ Schema compliance (and getting routed to the valid or the invalid relationship) + does not depend on which Writer is used to produce the output of the ValidateRecord processor. + Let us suppose that we used the same schema and input as in Example 2, but instead of + JsonRecordSetWriter, we used XMLRecordSetWriter to produce the output. + In both cases (Force Types From Reader's Schema = true and Force Types From Reader's Schema = false) + the output is: +

+
<test>
+    <field1>content_1</field1>
+    <field2>content_2</field2>
+    <field2>content_3</field2>
+</test>
+ +

+ However, if Force Types From Reader's Schema = true, this output is routed to the valid + relationship, and if Force Types From Reader's Schema = false, it is routed to the invalid + relationship. +

+ + + \ No newline at end of file