mirror of https://github.com/apache/nifi.git
NIFI-7969: ValidateRecord enhanced with Force Types From Schema property
NIFI-7969: Documentation update. Clarified that the Force Types From Schema property applies to the data read, whereas the Strict Type Checking property applies to the validation. NIFI-7969: Documentation update - updated the property name in additionalDetails.html. This closes #4825. Signed-off-by: Peter Turcsanyi <turcsanyi@apache.org>
This commit is contained in:
parent
5608f4389a
commit
a54f739229
|
@ -172,14 +172,29 @@ public class ValidateRecord extends AbstractProcessor {
|
|||
static final PropertyDescriptor STRICT_TYPE_CHECKING = new PropertyDescriptor.Builder()
|
||||
.name("strict-type-checking")
|
||||
.displayName("Strict Type Checking")
|
||||
.description("If the incoming data has a Record where a field is not of the correct type, this property determine whether how to handle the Record. "
|
||||
+ "If true, the Record will still be considered invalid. If false, the Record will be considered valid and the field will be coerced into the "
|
||||
+ "correct type (if possible, according to the type coercion supported by the Record Writer).")
|
||||
.description("If the incoming data has a Record where a field is not of the correct type, this property determines how to handle the Record. "
|
||||
+ "If true, the Record will be considered invalid. If false, the Record will be considered valid and the field will be coerced into the "
|
||||
+ "correct type (if possible, according to the type coercion supported by the Record Writer). "
|
||||
+ "This property controls how the data is validated against the validation schema.")
|
||||
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("true")
|
||||
.required(true)
|
||||
.build();
|
||||
static final PropertyDescriptor COERCE_TYPES = new PropertyDescriptor.Builder()
|
||||
.name("coerce-types")
|
||||
.displayName("Force Types From Reader's Schema")
|
||||
.description("If enabled, the processor will coerce every field to the type specified in the Reader's schema. "
|
||||
+ "If the value of a field cannot be coerced to the type, the field will be skipped (will not be read from the input data), "
|
||||
+ "thus will not appear in the output. "
|
||||
+ "If not enabled, then every field will appear in the output but their types may differ from what is "
|
||||
+ "specified in the schema. For details please see the Additional Details page of the processor's Help. "
|
||||
+ "This property controls how the data is read by the specified Record Reader.")
|
||||
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.required(true)
|
||||
.build();
|
||||
static final PropertyDescriptor VALIDATION_DETAILS_ATTRIBUTE_NAME = new PropertyDescriptor.Builder()
|
||||
.name("validation-details-attribute-name")
|
||||
.displayName("Validation Details Attribute Name")
|
||||
|
@ -227,6 +242,7 @@ public class ValidateRecord extends AbstractProcessor {
|
|||
properties.add(SCHEMA_TEXT);
|
||||
properties.add(ALLOW_EXTRA_FIELDS);
|
||||
properties.add(STRICT_TYPE_CHECKING);
|
||||
properties.add(COERCE_TYPES);
|
||||
properties.add(VALIDATION_DETAILS_ATTRIBUTE_NAME);
|
||||
properties.add(MAX_VALIDATION_DETAILS_LENGTH);
|
||||
return properties;
|
||||
|
@ -282,6 +298,7 @@ public class ValidateRecord extends AbstractProcessor {
|
|||
|
||||
final boolean allowExtraFields = context.getProperty(ALLOW_EXTRA_FIELDS).asBoolean();
|
||||
final boolean strictTypeChecking = context.getProperty(STRICT_TYPE_CHECKING).asBoolean();
|
||||
final boolean coerceTypes = context.getProperty(COERCE_TYPES).asBoolean();
|
||||
|
||||
RecordSetWriter validWriter = null;
|
||||
RecordSetWriter invalidWriter = null;
|
||||
|
@ -306,7 +323,7 @@ public class ValidateRecord extends AbstractProcessor {
|
|||
|
||||
try {
|
||||
Record record;
|
||||
while ((record = reader.nextRecord(false, false)) != null) {
|
||||
while ((record = reader.nextRecord(coerceTypes, false)) != null) {
|
||||
final SchemaValidationResult result = validator.validate(record);
|
||||
recordCount++;
|
||||
|
||||
|
|
|
@ -0,0 +1,190 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<title>ValidateRecord</title>
|
||||
<link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css"/>
|
||||
<style>
|
||||
table td:first-child {text-align: center;}
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h2>Examples of the effect of the Force Types From Reader's Schema property</h2>
|
||||
|
||||
<p>
|
||||
The processor first reads the data from the incoming FlowFile using the specified Record Reader,
|
||||
which uses a schema. Then, depending on the value of the Schema Access Strategy property, the processor
|
||||
can either use the reader's schema, or a different schema to validate the data against.
|
||||
After that, the processor writes the data into the outgoing FlowFile using the specified
|
||||
Record Writer. If the data is valid, the validation schema is used by the writer.
|
||||
If the data is invalid, the writer uses the reader's schema.
|
||||
The <b>Force Types From Reader's Schema</b> property affects the first step:
|
||||
how strictly the reader's schema should be applied when reading the data from the incoming FlowFile.
|
||||
By affecting how the data is read, the value of the Force Types From Reader's Schema property also has an effect on what
|
||||
the output of the ValidateRecord processor is,
|
||||
and also whether the output is forwarded to the <b>valid</b> or the <b>invalid</b> relationship.
|
||||
Below are two examples where the value of this property affects the output significantly.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
In both examples the input is in XML format and the output is in JSON.
|
||||
In the examples we assume that the same schema is used for reading, validation and writing.
|
||||
</p>
|
||||
|
||||
<h3>Example 1</h3>
|
||||
|
||||
<p>
|
||||
Schema:
|
||||
</p>
|
||||
<pre><code>{
|
||||
"namespace": "nifi",
|
||||
"name": "test",
|
||||
"type": "record",
|
||||
"fields": [
|
||||
{ "name": "field1", "type": "string" },
|
||||
{ "name": "field2", "type": "string" }
|
||||
]
|
||||
}
|
||||
</code></pre>
|
||||
|
||||
<p>
|
||||
Input:
|
||||
</p>
|
||||
<pre><code>&lt;test&gt;
|
||||
&lt;field1&gt;
|
||||
&lt;sub_field&gt;content&lt;/sub_field&gt;
|
||||
&lt;/field1&gt;
|
||||
&lt;field2&gt;content_of_field_2&lt;/field2&gt;
|
||||
&lt;/test&gt;</code></pre>
|
||||
|
||||
<p>
|
||||
Output if <b>Force Types From Reader's Schema = true</b>
|
||||
(forwarded to the <b>invalid</b> relationship):
|
||||
</p>
|
||||
<pre><code>[ {
|
||||
"field2" : "content_of_field_2"
|
||||
} ]</code></pre>
|
||||
|
||||
<p>
|
||||
Output if <b>Force Types From Reader's Schema = false</b>
|
||||
(forwarded to the <b>invalid</b> relationship):
|
||||
</p>
|
||||
<pre><code>[ {
|
||||
"field1" : {
|
||||
"sub_field" : "content"
|
||||
},
|
||||
"field2" : "content_of_field_2"
|
||||
} ]</code></pre>
|
||||
|
||||
<p>
|
||||
As you can see, the FlowFile is forwarded to the invalid relationship in both cases,
|
||||
since the input data does not match the provided Avro schema.
|
||||
However, if <b>Force Types From Reader's Schema = true</b>, only those fields appear in the output
|
||||
that comply with the schema. If <b>Force Types From Reader's Schema = false</b>, all fields appear
|
||||
in the output regardless of whether they comply with the schema or not.
|
||||
</p>
|
||||
|
||||
<h3>Example 2</h3>
|
||||
|
||||
<p>
|
||||
Schema:
|
||||
</p>
|
||||
<pre><code>{
|
||||
"namespace": "nifi",
|
||||
"name": "test",
|
||||
"type": "record",
|
||||
"fields": [
|
||||
{
|
||||
"name": "field1",
|
||||
"type": {
|
||||
"type": "array",
|
||||
"items": "string"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "field2",
|
||||
"type": {
|
||||
"type": "array",
|
||||
"items": "string"
|
||||
}
|
||||
}
|
||||
]
|
||||
}</code></pre>
|
||||
|
||||
<p>
|
||||
Input:
|
||||
</p>
|
||||
<pre><code>&lt;test&gt;
|
||||
&lt;field1&gt;content_1&lt;/field1&gt;
|
||||
&lt;field2&gt;content_2&lt;/field2&gt;
|
||||
&lt;field2&gt;content_3&lt;/field2&gt;
|
||||
&lt;/test&gt;</code></pre>
|
||||
|
||||
<p>
|
||||
Output if <b>Force Types From Reader's Schema = true</b>
|
||||
(forwarded to the <b>valid</b> relationship):
|
||||
</p>
|
||||
<pre><code>[ {
|
||||
"field1" : [ "content_1" ],
|
||||
"field2" : [ "content_2", "content_3" ]
|
||||
} ]</code></pre>
|
||||
|
||||
<p>
|
||||
Output if <b>Force Types From Reader's Schema = false</b>
|
||||
(forwarded to the <b>invalid</b> relationship):
|
||||
</p>
|
||||
<pre><code>[ {
|
||||
"field1" : "content_1",
|
||||
"field2" : [ "content_2", "content_3" ]
|
||||
} ]</code></pre>
|
||||
|
||||
<p>
|
||||
The schema expects two fields (field1 and field2), both of type ARRAY.
|
||||
field1 only appears once in the input XML document. If <b>Force Types From Reader's Schema = true</b>,
|
||||
the processor coerces this field into a type that complies with the schema,
|
||||
so the single value is wrapped in an array with one element. Since this type coercion can be done,
|
||||
the output is routed to the <b>valid</b> relationship.
|
||||
If <b>Force Types From Reader's Schema = false</b>, the processor does not try to apply
|
||||
type coercion, thus field1 appears in the output as a single value. According to the schema,
|
||||
the processor expects an array for field1, but receives a single element, so the output
|
||||
is routed to the <b>invalid</b> relationship.
|
||||
</p>
|
||||
<p>
|
||||
Schema compliance (and getting routed to the <b>valid</b> or the <b>invalid</b> relationship)
|
||||
does not depend on what Writer is used to produce the output of the ValidateRecord processor.
|
||||
Let us suppose that we used the same schema and input as in <b>Example 2</b>, but instead of
|
||||
JsonRecordSetWriter, we used XMLRecordSetWriter to produce the output.
|
||||
With both <b>Force Types From Reader's Schema = true</b> and <b>Force Types From Reader's Schema = false</b>,
|
||||
the output is:
|
||||
</p>
|
||||
<pre><code>&lt;test&gt;
|
||||
&lt;field1&gt;content_1&lt;/field1&gt;
|
||||
&lt;field2&gt;content_2&lt;/field2&gt;
|
||||
&lt;field2&gt;content_3&lt;/field2&gt;
|
||||
&lt;/test&gt;</code></pre>
|
||||
|
||||
<p>
|
||||
However, if <b>Force Types From Reader's Schema = true</b>, this output is routed to the <b>valid</b>
|
||||
relationship, and if <b>Force Types From Reader's Schema = false</b>, it is routed to the <b>invalid</b>
|
||||
relationship.
|
||||
</p>
|
||||
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue