mirror of https://github.com/apache/nifi.git
NIFI-2071 - Support repeating capture groups in ExtractText
This closes #1050. Signed-off-by: Koji Kawamura <ijokarumawak@apache.org>
This commit is contained in:
parent
56f7cd085f
commit
44d4b882bf
|
@ -72,8 +72,9 @@ import org.apache.nifi.stream.io.StreamUtils;
|
|||
+ "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If "
|
||||
+ "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" "
|
||||
+ "with a value of \"g\" will be added regardless."
|
||||
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
|
||||
+ "If the Regular Expression matches more than once, only the first match will be used. "
|
||||
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
|
||||
+ "If the Regular Expression matches more than once, only the first match will be used unless the property "
|
||||
+ "enabling repeating capture group is set to true. "
|
||||
+ "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. "
|
||||
+ "If no provided Regular Expression matches, the FlowFile will be routed to 'unmatched' "
|
||||
+ "and no attributes will be applied to the FlowFile.")
|
||||
|
@ -193,6 +194,16 @@ public class ExtractText extends AbstractProcessor {
|
|||
.defaultValue("true")
|
||||
.build();
|
||||
|
||||
public static final PropertyDescriptor ENABLE_REPEATING_CAPTURE_GROUP = new PropertyDescriptor.Builder()
|
||||
.name("extract-text-enable-repeating-capture-group")
|
||||
.displayName("Enable repeating capture group")
|
||||
.description("If set to true, every string matching the capture groups will be extracted. Otherwise, "
|
||||
+ "if the Regular Expression matches more than once, only the first match will be extracted.")
|
||||
.required(true)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.build();
|
||||
|
||||
public static final Relationship REL_MATCH = new Relationship.Builder()
|
||||
.name("matched")
|
||||
.description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result")
|
||||
|
@ -229,6 +240,7 @@ public class ExtractText extends AbstractProcessor {
|
|||
props.add(UNICODE_CHARACTER_CLASS);
|
||||
props.add(UNIX_LINES);
|
||||
props.add(INCLUDE_CAPTURE_GROUP_ZERO);
|
||||
props.add(ENABLE_REPEATING_CAPTURE_GROUP);
|
||||
this.properties = Collections.unmodifiableList(props);
|
||||
}
|
||||
|
||||
|
@ -320,22 +332,28 @@ public class ExtractText extends AbstractProcessor {
|
|||
for (final Map.Entry<String, Pattern> entry : patternMap.entrySet()) {
|
||||
|
||||
final Matcher matcher = entry.getValue().matcher(contentString);
|
||||
int j = 0;
|
||||
|
||||
if (matcher.find()) {
|
||||
while (matcher.find()) {
|
||||
final String baseKey = entry.getKey();
|
||||
for (int i = startGroupIdx; i <= matcher.groupCount(); i++) {
|
||||
final String key = new StringBuilder(baseKey).append(".").append(i).toString();
|
||||
int start = j == 0 ? startGroupIdx : 1;
|
||||
for (int i = start; i <= matcher.groupCount(); i++) {
|
||||
final String key = new StringBuilder(baseKey).append(".").append(i+j).toString();
|
||||
String value = matcher.group(i);
|
||||
if (value != null) {
|
||||
if (value != null && !value.isEmpty()) {
|
||||
if (value.length() > maxCaptureGroupLength) {
|
||||
value = value.substring(0, maxCaptureGroupLength);
|
||||
}
|
||||
regexResults.put(key, value);
|
||||
if (i == 1) {
|
||||
if (i == 1 && j == 0) {
|
||||
regexResults.put(baseKey, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
j += matcher.groupCount();
|
||||
if(!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -27,7 +27,6 @@ import org.apache.nifi.processor.Relationship;
|
|||
import org.apache.nifi.util.MockFlowFile;
|
||||
import org.apache.nifi.util.TestRunner;
|
||||
import org.apache.nifi.util.TestRunners;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestExtractText {
|
||||
|
@ -350,6 +349,59 @@ public class TestExtractText {
|
|||
out.assertAttributeEquals(attributeKey, SAMPLE_STRING);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFindAll() throws Exception {
|
||||
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
|
||||
testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
|
||||
final String attributeKey = "regex.result";
|
||||
testRunner.setProperty(attributeKey, "(?s)(\\w+)");
|
||||
testRunner.enqueue("This is my text".getBytes("UTF-8"));
|
||||
testRunner.run();
|
||||
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
|
||||
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
|
||||
// Ensure capture group zero and one numbered attribute per repeated match are in the resultant attributes
|
||||
out.assertAttributeExists(attributeKey + ".0");
|
||||
out.assertAttributeExists(attributeKey + ".1");
|
||||
out.assertAttributeExists(attributeKey + ".2");
|
||||
out.assertAttributeExists(attributeKey + ".3");
|
||||
out.assertAttributeExists(attributeKey + ".4");
|
||||
out.assertAttributeEquals(attributeKey, "This");
|
||||
out.assertAttributeEquals(attributeKey + ".0", "This");
|
||||
out.assertAttributeEquals(attributeKey + ".1", "This");
|
||||
out.assertAttributeEquals(attributeKey + ".2", "is");
|
||||
out.assertAttributeEquals(attributeKey + ".3", "my");
|
||||
out.assertAttributeEquals(attributeKey + ".4", "text");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFindAllPair() throws Exception {
|
||||
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
|
||||
testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
|
||||
final String attributeKey = "regex.result";
|
||||
testRunner.setProperty(attributeKey, "(\\w+)=(\\d+)");
|
||||
testRunner.enqueue("a=1,b=10,c=100".getBytes("UTF-8"));
|
||||
testRunner.run();
|
||||
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
|
||||
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
|
||||
// Ensure capture group zero and both capture groups of every repeated match are in the resultant attributes
|
||||
out.assertAttributeExists(attributeKey + ".0");
|
||||
out.assertAttributeExists(attributeKey + ".1");
|
||||
out.assertAttributeExists(attributeKey + ".2");
|
||||
out.assertAttributeExists(attributeKey + ".3");
|
||||
out.assertAttributeExists(attributeKey + ".4");
|
||||
out.assertAttributeExists(attributeKey + ".5");
|
||||
out.assertAttributeExists(attributeKey + ".6");
|
||||
out.assertAttributeNotExists(attributeKey + ".7"); // Ensure there's no more attributes
|
||||
out.assertAttributeEquals(attributeKey, "a");
|
||||
out.assertAttributeEquals(attributeKey + ".0", "a=1");
|
||||
out.assertAttributeEquals(attributeKey + ".1", "a");
|
||||
out.assertAttributeEquals(attributeKey + ".2", "1");
|
||||
out.assertAttributeEquals(attributeKey + ".3", "b");
|
||||
out.assertAttributeEquals(attributeKey + ".4", "10");
|
||||
out.assertAttributeEquals(attributeKey + ".5", "c");
|
||||
out.assertAttributeEquals(attributeKey + ".6", "100");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreZeroCaptureGroupProperty() throws Exception {
|
||||
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
|
||||
|
|
Loading…
Reference in New Issue