SOLR-4864: RegexReplaceProcessorFactory should support pattern capture group substitution in replacement string.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1586093 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-04-09 19:31:01 +00:00
parent 0b728e5336
commit f58f711f73
4 changed files with 126 additions and 5 deletions

View File

@ -175,6 +175,10 @@ New Features
java -Durl="http://username:password@hostname:8983/solr/update" -jar post.jar sample.xml
(Sameer Maggon via Uwe Schindler)
* SOLR-4864: RegexReplaceProcessorFactory should support pattern capture group
substitution in replacement string.
(Sunil Srinivasan, Jack Krupansky via Steve Rowe)
Bug Fixes
----------------------

View File

@ -35,12 +35,22 @@ import org.slf4j.LoggerFactory;
/**
* An updated processor that applies a configured regex to any
* CharSequence values found in the selected fields, and replaces
* any matches with the configured replacement string
* any matches with the configured replacement string.
*
* <p>
* By default this processor applies itself to no fields.
* </p>
*
* <p>
* By default, <code>literalReplacement</code> is set to true, in which
* case, the <code>replacement</code> string will be treated literally by
* quoting via {@link Matcher#quoteReplacement(String)}. And hence, '\'
* and '$' signs will not be processed. When <code>literalReplacement</code>
* is set to false, one can perform backreference operations and capture
* group substitutions.
* </p>
*
* <p>
* For example, with the configuration listed below, any sequence of multiple
* whitespace characters found in values for field named <code>title</code>
* or <code>content</code> will be replaced by a single space character.
@ -52,8 +62,9 @@ import org.slf4j.LoggerFactory;
* &lt;str name="fieldName"&gt;title&lt;/str&gt;
* &lt;str name="pattern"&gt;\s+&lt;/str&gt;
* &lt;str name="replacement"&gt; &lt;/str&gt;
* &lt;bool name="literalReplacement"&gt;true&lt;/bool&gt;
* &lt;/processor&gt;</pre>
*
*
* @see java.util.regex.Pattern
*/
public final class RegexReplaceProcessorFactory extends FieldMutatingUpdateProcessorFactory {
@ -62,9 +73,12 @@ public final class RegexReplaceProcessorFactory extends FieldMutatingUpdateProce
private static final String REPLACEMENT_PARAM = "replacement";
private static final String PATTERN_PARAM = "pattern";
private static final String LITERAL_REPLACEMENT_PARAM = "literalReplacement";
private Pattern pattern;
private String replacement;
// by default, literalReplacementEnabled is set to true to allow backward compatibility
private boolean literalReplacementEnabled = true;
@SuppressWarnings("unchecked")
@Override
@ -72,7 +86,7 @@ public final class RegexReplaceProcessorFactory extends FieldMutatingUpdateProce
Object patternParam = args.remove(PATTERN_PARAM);
if(patternParam == null) {
if (patternParam == null) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Missing required init parameter: " + PATTERN_PARAM);
}
@ -85,11 +99,22 @@ public final class RegexReplaceProcessorFactory extends FieldMutatingUpdateProce
}
Object replacementParam = args.remove(REPLACEMENT_PARAM);
if(replacementParam == null) {
if (replacementParam == null) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Missing required init parameter: " + REPLACEMENT_PARAM);
}
replacement = Matcher.quoteReplacement(replacementParam.toString());
Boolean literalReplacement = args.removeBooleanArg(LITERAL_REPLACEMENT_PARAM);
if (literalReplacement != null) {
literalReplacementEnabled = literalReplacement;
}
if (literalReplacementEnabled) {
replacement = Matcher.quoteReplacement(replacementParam.toString());
} else {
replacement = replacementParam.toString();
}
super.init(args);
}

View File

@ -412,6 +412,45 @@
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="regex-replace-literal-true">
<processor class="solr.RegexReplaceProcessorFactory">
<str name="fieldName">content</str>
<str name="fieldName">title</str>
<str name="pattern">(try)</str>
<str name="replacement">&lt;$1&gt;</str>
<bool name="literalReplacement">true</bool>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="regex-replace-literal-default-true">
<processor class="solr.RegexReplaceProcessorFactory">
<str name="fieldName">content</str>
<str name="fieldName">title</str>
<str name="pattern">(try)</str>
<str name="replacement">&lt;$1&gt;</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="regex-replace-literal-false">
<processor class="solr.RegexReplaceProcessorFactory">
<str name="fieldName">content</str>
<str name="fieldName">title</str>
<str name="pattern">(try)</str>
<str name="replacement">&lt;$1&gt;</str>
<bool name="literalReplacement">false</bool>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="regex-replace-literal-str-true">
<processor class="solr.RegexReplaceProcessorFactory">
<str name="fieldName">content</str>
<str name="fieldName">title</str>
<str name="pattern">(try)</str>
<str name="replacement">&lt;$1&gt;</str>
<str name="literalReplacement">true</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="default-values">
<processor class="solr.DefaultValueUpdateProcessorFactory">
<str name="fieldName">processor_default_s</str>

View File

@ -367,6 +367,59 @@ public class FieldMutatingUpdateProcessorTest extends UpdateProcessorTestBase {
d.getFieldValue("content"));
assertEquals("ThisXtitleXhasXaXlotXofXspaces",
d.getFieldValue("title"));
// literalReplacement = true
d = processAdd("regex-replace-literal-true",
doc(f("id", "doc2"),
f("content", "Let's try this one"),
f("title", "Let's try try this one")));
assertNotNull(d);
assertEquals("Let's <$1> this one",
d.getFieldValue("content"));
assertEquals("Let's <$1> <$1> this one",
d.getFieldValue("title"));
// literalReplacement is not specified, defaults to true
d = processAdd("regex-replace-literal-default-true",
doc(f("id", "doc3"),
f("content", "Let's try this one"),
f("title", "Let's try try this one")));
assertNotNull(d);
assertEquals("Let's <$1> this one",
d.getFieldValue("content"));
assertEquals("Let's <$1> <$1> this one",
d.getFieldValue("title"));
// if user passes literalReplacement as a string param instead of boolean
d = processAdd("regex-replace-literal-str-true",
doc(f("id", "doc4"),
f("content", "Let's try this one"),
f("title", "Let's try try this one")));
assertNotNull(d);
assertEquals("Let's <$1> this one",
d.getFieldValue("content"));
assertEquals("Let's <$1> <$1> this one",
d.getFieldValue("title"));
// This is with literalReplacement = false
d = processAdd("regex-replace-literal-false",
doc(f("id", "doc5"),
f("content", "Let's try this one"),
f("title", "Let's try try this one")));
assertNotNull(d);
assertEquals("Let's <try> this one",
d.getFieldValue("content"));
assertEquals("Let's <try> <try> this one",
d.getFieldValue("title"));
}
public void testFirstValue() throws Exception {