NIFI-2071 - Support repeating capture groups in ExtractText

This closes #1050.

Signed-off-by: Koji Kawamura <ijokarumawak@apache.org>
This commit is contained in:
Pierre Villard 2016-09-22 20:43:42 +02:00 committed by Koji Kawamura
parent 56f7cd085f
commit 44d4b882bf
2 changed files with 78 additions and 8 deletions

View File

@ -72,8 +72,9 @@ import org.apache.nifi.stream.io.StreamUtils;
+ "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If "
+ "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" "
+ "with a value of \"g\" will be added regardless."
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
+ "If the Regular Expression matches more than once, only the first match will be used. "
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
+ "If the Regular Expression matches more than once, only the first match will be used unless the property "
+ "enabling repeating capture group is set to true. "
+ "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. "
+ "If no provided Regular Expression matches, the FlowFile will be routed to 'unmatched' "
+ "and no attributes will be applied to the FlowFile.")
@ -193,6 +194,16 @@ public class ExtractText extends AbstractProcessor {
.defaultValue("true")
.build();
/**
 * Processor property that switches between extracting only the first match of a
 * Regular Expression and extracting every match.
 * <p>
 * The property is required, accepts only {@code "true"} or {@code "false"}, and
 * defaults to {@code "false"}, i.e. only the first match is extracted unless the
 * user opts in.
 */
public static final PropertyDescriptor ENABLE_REPEATING_CAPTURE_GROUP = new PropertyDescriptor.Builder()
.name("extract-text-enable-repeating-capture-group")
.displayName("Enable repeating capture group")
.description("If set to true, every string matching the capture groups will be extracted. Otherwise, "
+ "if the Regular Expression matches more than once, only the first match will be extracted.")
.required(true)
.allowableValues("true", "false")
.defaultValue("false")
.build();
public static final Relationship REL_MATCH = new Relationship.Builder()
.name("matched")
.description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result")
@ -229,6 +240,7 @@ public class ExtractText extends AbstractProcessor {
props.add(UNICODE_CHARACTER_CLASS);
props.add(UNIX_LINES);
props.add(INCLUDE_CAPTURE_GROUP_ZERO);
props.add(ENABLE_REPEATING_CAPTURE_GROUP);
this.properties = Collections.unmodifiableList(props);
}
@ -320,22 +332,28 @@ public class ExtractText extends AbstractProcessor {
for (final Map.Entry<String, Pattern> entry : patternMap.entrySet()) {
final Matcher matcher = entry.getValue().matcher(contentString);
int j = 0;
if (matcher.find()) {
while (matcher.find()) {
final String baseKey = entry.getKey();
for (int i = startGroupIdx; i <= matcher.groupCount(); i++) {
final String key = new StringBuilder(baseKey).append(".").append(i).toString();
int start = j == 0 ? startGroupIdx : 1;
for (int i = start; i <= matcher.groupCount(); i++) {
final String key = new StringBuilder(baseKey).append(".").append(i+j).toString();
String value = matcher.group(i);
if (value != null) {
if (value != null && !value.isEmpty()) {
if (value.length() > maxCaptureGroupLength) {
value = value.substring(0, maxCaptureGroupLength);
}
regexResults.put(key, value);
if (i == 1) {
if (i == 1 && j == 0) {
regexResults.put(baseKey, value);
}
}
}
j += matcher.groupCount();
if(!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) {
break;
}
}
}

View File

@ -27,7 +27,6 @@ import org.apache.nifi.processor.Relationship;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Test;
public class TestExtractText {
@ -350,6 +349,59 @@ public class TestExtractText {
out.assertAttributeEquals(attributeKey, SAMPLE_STRING);
}
/**
 * Verifies repeating-capture-group extraction for a pattern with a single capture group.
 * <p>
 * The input {@code "This is my text"} contains four {@code \w+} matches. With repeating
 * capture groups enabled the test expects the whole first match under {@code .0}, the four
 * words under {@code .1} through {@code .4}, and the un-suffixed attribute to carry the
 * first word. It also checks that nothing is written past the last match.
 */
@Test
public void testFindAll() throws Exception {
    final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
    testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
    final String attributeKey = "regex.result";
    testRunner.setProperty(attributeKey, "(?s)(\\w+)");
    testRunner.enqueue("This is my text".getBytes("UTF-8"));
    testRunner.run();
    testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Ensure the zero capture group and one attribute per matched word are present
    out.assertAttributeExists(attributeKey + ".0");
    out.assertAttributeExists(attributeKey + ".1");
    out.assertAttributeExists(attributeKey + ".2");
    out.assertAttributeExists(attributeKey + ".3");
    out.assertAttributeExists(attributeKey + ".4");
    // Only four words are in the input, so a fifth indexed attribute would be a bug
    out.assertAttributeNotExists(attributeKey + ".5");

    out.assertAttributeEquals(attributeKey, "This");
    out.assertAttributeEquals(attributeKey + ".0", "This");
    out.assertAttributeEquals(attributeKey + ".1", "This");
    out.assertAttributeEquals(attributeKey + ".2", "is");
    out.assertAttributeEquals(attributeKey + ".3", "my");
    out.assertAttributeEquals(attributeKey + ".4", "text");
}
/**
 * Verifies repeating-capture-group extraction for a pattern with two capture groups.
 * <p>
 * The input {@code "a=1,b=10,c=100"} holds three key/value matches. The test expects the
 * whole first match under {@code .0}, followed by each key and value in order under
 * {@code .1} through {@code .6}, no {@code .7} attribute, and the un-suffixed attribute
 * equal to the first key.
 */
@Test
public void testFindAllPair() throws Exception {
    final String attributeKey = "regex.result";
    // Value expected under attributeKey + "." + index, for index 0..6
    final String[] expectedByIndex = {"a=1", "a", "1", "b", "10", "c", "100"};

    final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
    testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
    testRunner.setProperty(attributeKey, "(\\w+)=(\\d+)");
    testRunner.enqueue("a=1,b=10,c=100".getBytes("UTF-8"));
    testRunner.run();
    testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);

    final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Every indexed attribute, including the zero capture group, must be present
    for (int idx = 0; idx < expectedByIndex.length; idx++) {
        out.assertAttributeExists(attributeKey + "." + idx);
    }
    // Ensure there's no attribute beyond the last captured value
    out.assertAttributeNotExists(attributeKey + "." + expectedByIndex.length);

    // The bare attribute carries the first capture group of the first match
    out.assertAttributeEquals(attributeKey, "a");
    for (int idx = 0; idx < expectedByIndex.length; idx++) {
        out.assertAttributeEquals(attributeKey + "." + idx, expectedByIndex[idx]);
    }
}
@Test
public void testIgnoreZeroCaptureGroupProperty() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());