From 44d4b882bf90c6fc8a9c924e17ce3256f9f0e486 Mon Sep 17 00:00:00 2001 From: Pierre Villard Date: Thu, 22 Sep 2016 20:43:42 +0200 Subject: [PATCH] NIFI-2071 - Support repeating capture groups in ExtractText This closes #1050. Signed-off-by: Koji Kawamura --- .../nifi/processors/standard/ExtractText.java | 32 ++++++++--- .../processors/standard/TestExtractText.java | 54 ++++++++++++++++++- 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java index a4370b8ab7..e1d641d70d 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java @@ -72,8 +72,9 @@ import org.apache.nifi.stream.io.StreamUtils; + "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If " + "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" " + "with a value of \"g\" will be added regardless." - + "The value of the property must be a valid Regular Expressions with one or more capturing groups. " - + "If the Regular Expression matches more than once, only the first match will be used. " + + "The value of the property must be a valid Regular Expressions with one or more capturing groups. " + + "If the Regular Expression matches more than once, only the first match will be used unless the property " + + "enabling repeating capture group is set to true. " + "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. " + "If no provided Regular Expression matches, the FlowFile will be routed to 'unmatched' " + "and no attributes will be applied to the FlowFile.") @@ -193,6 +194,16 @@ public class ExtractText extends AbstractProcessor { .defaultValue("true") .build(); + public static final PropertyDescriptor ENABLE_REPEATING_CAPTURE_GROUP = new PropertyDescriptor.Builder() + .name("extract-text-enable-repeating-capture-group") + .displayName("Enable repeating capture group") + .description("If set to true, every string matching the capture groups will be extracted. Otherwise, " + + "if the Regular Expression matches more than once, only the first match will be extracted.") + .required(true) + .allowableValues("true", "false") + .defaultValue("false") + .build(); + public static final Relationship REL_MATCH = new Relationship.Builder() .name("matched") .description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result") @@ -229,6 +240,7 @@ public class ExtractText extends AbstractProcessor { props.add(UNICODE_CHARACTER_CLASS); props.add(UNIX_LINES); props.add(INCLUDE_CAPTURE_GROUP_ZERO); + props.add(ENABLE_REPEATING_CAPTURE_GROUP); this.properties = Collections.unmodifiableList(props); } @@ -320,22 +332,28 @@ public class ExtractText extends AbstractProcessor { for (final Map.Entry entry : patternMap.entrySet()) { final Matcher matcher = entry.getValue().matcher(contentString); + int j = 0; - if (matcher.find()) { + while (matcher.find()) { final String baseKey = entry.getKey(); - for (int i = startGroupIdx; i <= matcher.groupCount(); i++) { - final String key = new StringBuilder(baseKey).append(".").append(i).toString(); + int start = j == 0 ? startGroupIdx : 1; + for (int i = start; i <= matcher.groupCount(); i++) { + final String key = new StringBuilder(baseKey).append(".").append(i+j).toString(); String value = matcher.group(i); - if (value != null) { + if (value != null && !value.isEmpty()) { if (value.length() > maxCaptureGroupLength) { value = value.substring(0, maxCaptureGroupLength); } regexResults.put(key, value); - if (i == 1) { + if (i == 1 && j == 0) { regexResults.put(baseKey, value); } } } + j += matcher.groupCount(); + if(!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) { + break; + } } } diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java index e5022b73b9..93d5147089 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java @@ -27,7 +27,6 @@ import org.apache.nifi.processor.Relationship; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; import org.apache.nifi.util.TestRunners; - import org.junit.Test; public class TestExtractText { @@ -350,6 +349,59 @@ public class TestExtractText { out.assertAttributeEquals(attributeKey, SAMPLE_STRING); } + @Test + public void testFindAll() throws Exception { + final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText()); + testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true"); + final String attributeKey = "regex.result"; + testRunner.setProperty(attributeKey, "(?s)(\\w+)"); + testRunner.enqueue("This is my text".getBytes("UTF-8")); + testRunner.run(); + testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1); + final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0); + // Ensure the zero capture group is in the resultant attributes + out.assertAttributeExists(attributeKey + ".0"); + out.assertAttributeExists(attributeKey + ".1"); + out.assertAttributeExists(attributeKey + ".2"); + out.assertAttributeExists(attributeKey + ".3"); + out.assertAttributeExists(attributeKey + ".4"); + out.assertAttributeEquals(attributeKey, "This"); + out.assertAttributeEquals(attributeKey + ".0", "This"); + out.assertAttributeEquals(attributeKey + ".1", "This"); + out.assertAttributeEquals(attributeKey + ".2", "is"); + out.assertAttributeEquals(attributeKey + ".3", "my"); + out.assertAttributeEquals(attributeKey + ".4", "text"); + } + + @Test + public void testFindAllPair() throws Exception { + final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText()); + testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true"); + final String attributeKey = "regex.result"; + testRunner.setProperty(attributeKey, "(\\w+)=(\\d+)"); + testRunner.enqueue("a=1,b=10,c=100".getBytes("UTF-8")); + testRunner.run(); + testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1); + final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0); + // Ensure the zero capture group is in the resultant attributes + out.assertAttributeExists(attributeKey + ".0"); + out.assertAttributeExists(attributeKey + ".1"); + out.assertAttributeExists(attributeKey + ".2"); + out.assertAttributeExists(attributeKey + ".3"); + out.assertAttributeExists(attributeKey + ".4"); + out.assertAttributeExists(attributeKey + ".5"); + out.assertAttributeExists(attributeKey + ".6"); + out.assertAttributeNotExists(attributeKey + ".7"); // Ensure there's no more attributes + out.assertAttributeEquals(attributeKey, "a"); + out.assertAttributeEquals(attributeKey + ".0", "a=1"); + out.assertAttributeEquals(attributeKey + ".1", "a"); + out.assertAttributeEquals(attributeKey + ".2", "1"); + out.assertAttributeEquals(attributeKey + ".3", "b"); + out.assertAttributeEquals(attributeKey + ".4", "10"); + out.assertAttributeEquals(attributeKey + ".5", "c"); + out.assertAttributeEquals(attributeKey + ".6", "100"); + } + @Test public void testIgnoreZeroCaptureGroupProperty() throws Exception { final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());