mirror of https://github.com/apache/nifi.git
NIFI-808 Providing a property that allows capture group 0 to be excluded from the attributes produced for any regular expression.
This commit is contained in:
parent
f171756a88
commit
f044ba5d45
|
@ -170,13 +170,22 @@ public class ExtractText extends AbstractProcessor {
|
||||||
|
|
||||||
public static final PropertyDescriptor UNIX_LINES = new PropertyDescriptor.Builder()
|
public static final PropertyDescriptor UNIX_LINES = new PropertyDescriptor.Builder()
|
||||||
.name("Enable Unix Lines Mode")
|
.name("Enable Unix Lines Mode")
|
||||||
.description("Indicates that only the '\n' line terminator is recognized int the behavior of '.', '^', and '$'. Can also be specified "
|
.description("Indicates that only the '\n' line terminator is recognized in the behavior of '.', '^', and '$'. Can also be specified "
|
||||||
+ "via the embeded flag (?d).")
|
+ "via the embeded flag (?d).")
|
||||||
.required(true)
|
.required(true)
|
||||||
.allowableValues("true", "false")
|
.allowableValues("true", "false")
|
||||||
.defaultValue("false")
|
.defaultValue("false")
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
|
/**
 * Property that controls whether capture group 0 (the entire regular
 * expression match) is emitted as an attribute. It is required, accepts only
 * "true" or "false", and defaults to "true". The processor reads it as a
 * boolean to decide whether the capture-group loop starts at index 0 or 1.
 */
public static final PropertyDescriptor INCLUDE_CAPTURE_GROUP_ZERO = new PropertyDescriptor.Builder()
|
||||||
|
.name("Include Capture Group 0")
|
||||||
|
.description("Indicates that Capture Group 0 should be included as an attribute. Capture Group 0 represents the entirety of the regular expression match, is typically not used, and "
|
||||||
|
+ "could have considerable length.")
|
||||||
|
.required(true)
|
||||||
|
.allowableValues("true", "false")
|
||||||
|
// "true" is the default, so group 0 is included unless the property is set to "false"
.defaultValue("true")
|
||||||
|
.build();
|
||||||
|
|
||||||
public static final Relationship REL_MATCH = new Relationship.Builder()
|
public static final Relationship REL_MATCH = new Relationship.Builder()
|
||||||
.name("matched")
|
.name("matched")
|
||||||
.description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result")
|
.description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result")
|
||||||
|
@ -212,6 +221,7 @@ public class ExtractText extends AbstractProcessor {
|
||||||
props.add(UNICODE_CASE);
|
props.add(UNICODE_CASE);
|
||||||
props.add(UNICODE_CHARACTER_CLASS);
|
props.add(UNICODE_CHARACTER_CLASS);
|
||||||
props.add(UNIX_LINES);
|
props.add(UNIX_LINES);
|
||||||
|
props.add(INCLUDE_CAPTURE_GROUP_ZERO);
|
||||||
this.properties = Collections.unmodifiableList(props);
|
this.properties = Collections.unmodifiableList(props);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -297,13 +307,16 @@ public class ExtractText extends AbstractProcessor {
|
||||||
final Map<String, String> regexResults = new HashMap<>();
|
final Map<String, String> regexResults = new HashMap<>();
|
||||||
|
|
||||||
final Map<String, Pattern> patternMap = compiledPattersMapRef.get();
|
final Map<String, Pattern> patternMap = compiledPattersMapRef.get();
|
||||||
|
|
||||||
|
final int startGroupIdx = context.getProperty(INCLUDE_CAPTURE_GROUP_ZERO).asBoolean() ? 0 : 1;
|
||||||
|
|
||||||
for (final Map.Entry<String, Pattern> entry : patternMap.entrySet()) {
|
for (final Map.Entry<String, Pattern> entry : patternMap.entrySet()) {
|
||||||
|
|
||||||
final Matcher matcher = entry.getValue().matcher(contentString);
|
final Matcher matcher = entry.getValue().matcher(contentString);
|
||||||
|
|
||||||
if (matcher.find()) {
|
if (matcher.find()) {
|
||||||
final String baseKey = entry.getKey();
|
final String baseKey = entry.getKey();
|
||||||
for (int i = 0; i <= matcher.groupCount(); i++) {
|
for (int i = startGroupIdx; i <= matcher.groupCount(); i++) {
|
||||||
final String key = new StringBuilder(baseKey).append(".").append(i).toString();
|
final String key = new StringBuilder(baseKey).append(".").append(i).toString();
|
||||||
String value = matcher.group(i);
|
String value = matcher.group(i);
|
||||||
if (value.length() > maxCaptureGroupLength) {
|
if (value.length() > maxCaptureGroupLength) {
|
||||||
|
|
|
@ -310,4 +310,43 @@ public class TestExtractText {
|
||||||
assertEquals(2, relationships.size());
|
assertEquals(2, relationships.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Checks the default behaviour: with INCLUDE_CAPTURE_GROUP_ZERO left unset
 * (its declared default is "true"), a matching FlowFile must carry the
 * "regex.result.0" attribute in addition to the un-suffixed "regex.result".
 */
@Test
|
||||||
|
public void testIncludeZeroCaptureGroupProperty() throws Exception {
|
||||||
|
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
|
||||||
|
|
||||||
|
final String attributeKey = "regex.result";
|
||||||
|
|
||||||
|
// (?s) turns on DOTALL, so the single group (.*) can span line terminators
testRunner.setProperty(attributeKey, "(?s)(.*)");
|
||||||
|
|
||||||
|
testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
// Exactly one FlowFile is expected, and it must go to the "matched" relationship
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
|
||||||
|
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
|
||||||
|
|
||||||
|
// Ensure the zero capture group is in the resultant attributes
|
||||||
|
out.assertAttributeExists(attributeKey + ".0");
|
||||||
|
// The un-suffixed attribute is expected to hold the full sample content
out.assertAttributeEquals(attributeKey, SAMPLE_STRING);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Checks the opt-out behaviour: with INCLUDE_CAPTURE_GROUP_ZERO set to
 * "false", a matching FlowFile must not carry the "regex.result.0"
 * attribute, while the un-suffixed "regex.result" attribute is still set.
 */
@Test
|
||||||
|
public void testIgnoreZeroCaptureGroupProperty() throws Exception {
|
||||||
|
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
|
||||||
|
|
||||||
|
// Explicitly disable capture group 0; every other property keeps its default
testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
|
||||||
|
|
||||||
|
final String attributeKey = "regex.result";
|
||||||
|
|
||||||
|
// (?s) turns on DOTALL, so the single group (.*) can span line terminators
testRunner.setProperty(attributeKey, "(?s)(.*)");
|
||||||
|
|
||||||
|
testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
// Exactly one FlowFile is expected, and it must go to the "matched" relationship
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
|
||||||
|
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
|
||||||
|
|
||||||
|
// Ensure the zero capture group is not in the resultant attributes
|
||||||
|
out.assertAttributeNotExists(attributeKey + ".0");
|
||||||
|
// The un-suffixed attribute is still expected to hold the full sample content
out.assertAttributeEquals(attributeKey, SAMPLE_STRING);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue