NIFI-4095 Changed minimum capture group count in ExtractText from 1 to 0.

Added unit test and removed obsolete test.
Added custom validation to enforce capture group if "include capture group 0" is false.
This commit is contained in:
Andy LoPresto 2017-06-20 16:23:19 -04:00 committed by Pierre Villard
parent 5c755c006b
commit 253ea2e73b
2 changed files with 86 additions and 31 deletions

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@ -31,7 +32,6 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.nifi.annotation.behavior.DynamicProperty;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
@ -43,6 +43,9 @@ import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.annotation.lifecycle.OnStopped;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.AbstractProcessor;
@ -62,26 +65,26 @@ import org.apache.nifi.stream.io.StreamUtils;
@Tags({"evaluate", "extract", "Text", "Regular Expression", "regex"})
@CapabilityDescription(
"Evaluates one or more Regular Expressions against the content of a FlowFile. "
+ "The results of those Regular Expressions are assigned to FlowFile Attributes. "
+ "Regular Expressions are entered by adding user-defined properties; "
+ "the name of the property maps to the Attribute Name into which the result will be placed. "
+ "The first capture group, if any found, will be placed into that attribute name."
+ "But all capture groups, including the matching string sequence itself will also be "
+ "provided at that attribute name with an index value provided, with the exception of a capturing group "
+ "that is optional and does not match - for example, given the attribute name \"regex\" and expression "
+ "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If "
+ "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" "
+ "with a value of \"g\" will be added regardless."
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
+ "If the Regular Expression matches more than once, only the first match will be used unless the property "
+ "enabling repeating capture group is set to true. "
+ "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. "
+ "If no provided Regular Expression matches, the FlowFile will be routed to 'unmatched' "
+ "and no attributes will be applied to the FlowFile.")
+ "The results of those Regular Expressions are assigned to FlowFile Attributes. "
+ "Regular Expressions are entered by adding user-defined properties; "
+ "the name of the property maps to the Attribute Name into which the result will be placed. "
+ "The first capture group, if any found, will be placed into that attribute name."
+ "But all capture groups, including the matching string sequence itself will also be "
+ "provided at that attribute name with an index value provided, with the exception of a capturing group "
+ "that is optional and does not match - for example, given the attribute name \"regex\" and expression "
+ "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If "
+ "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" "
+ "with a value of \"g\" will be added regardless."
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
+ "If the Regular Expression matches more than once, only the first match will be used unless the property "
+ "enabling repeating capture group is set to true. "
+ "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. "
+ "If no provided Regular Expression matches, the FlowFile will be routed to 'unmatched' "
+ "and no attributes will be applied to the FlowFile.")
@DynamicProperty(name = "A FlowFile attribute", value = "A Regular Expression with one or more capturing group",
description = "The first capture group, if any found, will be placed into that attribute name."
+ "But all capture groups, including the matching string sequence itself will also be "
+ "provided at that attribute name with an index value provided.")
+ "But all capture groups, including the matching string sequence itself will also be "
+ "provided at that attribute name with an index value provided.")
public class ExtractText extends AbstractProcessor {
public static final PropertyDescriptor CHARACTER_SET = new PropertyDescriptor.Builder()
@ -259,12 +262,38 @@ public class ExtractText extends AbstractProcessor {
return new PropertyDescriptor.Builder()
.name(propertyDescriptorName)
.expressionLanguageSupported(false)
.addValidator(StandardValidators.createRegexValidator(1, 40, true))
.addValidator(StandardValidators.createRegexValidator(0, 40, true))
.required(false)
.dynamic(true)
.build();
}
@Override
protected Collection<ValidationResult> customValidate(final ValidationContext validationContext) {
final List<ValidationResult> problems = new ArrayList<>(super.customValidate(validationContext));
// If the capture group zero is not going to be included, each dynamic property must have at least one group
final boolean includeCaptureGroupZero = validationContext.getProperty(INCLUDE_CAPTURE_GROUP_ZERO).getValue().equalsIgnoreCase("true");
getLogger().debug("Include capture group zero is " + includeCaptureGroupZero);
if (!includeCaptureGroupZero) {
final Validator oneGroupMinimumValidator = StandardValidators.createRegexValidator(1, 40, true);
for (Map.Entry<PropertyDescriptor, String> prop : validationContext.getProperties().entrySet()) {
PropertyDescriptor pd = prop.getKey();
if (pd.isDynamic()) {
String value = validationContext.getProperty(pd).getValue();
getLogger().debug("Evaluating dynamic property " + pd.getDisplayName() + " (" + pd.getName() + ") with value " + value);
ValidationResult result = oneGroupMinimumValidator.validate(pd.getDisplayName(), value, validationContext);
getLogger().debug("Validation result: " + result.toString());
if (!result.isValid()) {
problems.add(result);
}
}
}
}
return problems;
}
@OnScheduled
public final void onScheduled(final ProcessContext context) throws IOException {
final Map<String, Pattern> compiledPatternsMap = new HashMap<>();
@ -338,7 +367,7 @@ public class ExtractText extends AbstractProcessor {
final String baseKey = entry.getKey();
int start = j == 0 ? startGroupIdx : 1;
for (int i = start; i <= matcher.groupCount(); i++) {
final String key = new StringBuilder(baseKey).append(".").append(i+j).toString();
final String key = new StringBuilder(baseKey).append(".").append(i + j).toString();
String value = matcher.group(i);
if (value != null && !value.isEmpty()) {
if (value.length() > maxCaptureGroupLength) {
@ -351,7 +380,7 @@ public class ExtractText extends AbstractProcessor {
}
}
j += matcher.groupCount();
if(!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) {
if (!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) {
break;
}
}

View File

@ -22,7 +22,6 @@ import static org.junit.Assert.assertTrue;
import java.io.UnsupportedEncodingException;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
@ -216,14 +215,6 @@ public class TestExtractText {
out.assertAttributeEquals("regex.result7", null);
}
@Test(expected = java.lang.AssertionError.class)
public void testNoCaptureGroups() throws UnsupportedEncodingException {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty("regex.result1", ".*");
testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
testRunner.run();
}
@Test
public void testNoFlowFile() throws UnsupportedEncodingException {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
@ -422,4 +413,39 @@ public class TestExtractText {
out.assertAttributeNotExists(attributeKey + ".0");
out.assertAttributeEquals(attributeKey, SAMPLE_STRING);
}
@Test
public void testShouldAllowNoCaptureGroups() throws Exception {
// Arrange
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s).*");
// Act
testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
testRunner.run();
// Assert
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
// There is no global capture group, so only "key.0" exists
out.assertAttributeNotExists(attributeKey);
out.assertAttributeEquals(attributeKey + ".0", SAMPLE_STRING);
}
@Test(expected = java.lang.AssertionError.class)
public void testShouldNotAllowNoCaptureGroupsIfZeroDisabled() throws Exception {
// Arrange
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s).*");
// Act
testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
// Validation should fail because nothing will match
testRunner.run();
}
}