diff --git a/CHANGES.txt b/CHANGES.txt
index 94d45959c8b..8e09f30cb39 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -69,6 +69,8 @@ New Features
* SOLR-1532: Allow StreamingUpdateSolrServer to use a provided HttpClient (Gabriele Renzi via shalin)
+* SOLR-1653: Add PatternReplaceCharFilter (koji)
+
Optimizations
----------------------
diff --git a/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java b/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
index 417e601124e..62ff65dd0ef 100644
--- a/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
@@ -43,4 +43,20 @@ public abstract class BaseCharFilterFactory implements CharFilterFactory {
this.args = args;
}
+ protected int getInt(String name) {
+ return getInt(name,-1,false);
+ }
+
+ protected int getInt(String name, int defaultVal) {
+ return getInt(name,defaultVal,true);
+ }
+
+ protected int getInt(String name, int defaultVal, boolean useDefault) {
+ String s = args.get(name);
+ if (s==null) {
+ if (useDefault) return defaultVal;
+ throw new RuntimeException("Configuration Error: missing parameter '" + name + "'");
+ }
+ return Integer.parseInt(s);
+ }
}
diff --git a/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java b/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java
new file mode 100644
index 00000000000..adc24636a0e
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java
@@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.BaseCharFilter;
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * CharFilter that uses a regular expression for the target of replace string.
+ * The pattern match will be done in each "block" in char stream.
+ *
+ *
+ * ex1) source="aa bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"
+ * output="aa#bb aa#bb"
+ *
+ *
+ * NOTE: If you produce a phrase that has different length to source string
+ * and the field is used for highlighting for a term of the phrase, you will
+ * face a trouble.
+ *
+ *
+ * ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"
+ * output="aa bb"
+ * and you want to search bb and highlight it, you will get
+ * highlight snippet="aa1<em>23bb</em>"
+ *
+ *
+ * @version $Id$
+ * @since Solr 1.5
+ */
+public class PatternReplaceCharFilter extends BaseCharFilter {
+
+ private final Pattern pattern;
+ private final String replacement;
+ private final int maxBlockChars;
+ private final String blockDelimiters;
+ public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
+
+ private LinkedList buffer;
+ private int nextCharCounter;
+ private char[] blockBuffer;
+ private int blockBufferLength;
+ private String replaceBlockBuffer;
+ private int replaceBlockBufferOffset;
+
+ public PatternReplaceCharFilter( String pattern, String replacement, CharStream in ){
+ this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
+ }
+
+ public PatternReplaceCharFilter( String pattern, String replacement,
+ int maxBlockChars, CharStream in ){
+ this( pattern, replacement, maxBlockChars, null, in );
+ }
+
+ public PatternReplaceCharFilter( String pattern, String replacement,
+ String blockDelimiters, CharStream in ){
+ this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
+ }
+
+ public PatternReplaceCharFilter( String pattern, String replacement,
+ int maxBlockChars, String blockDelimiters, CharStream in ){
+ super( in );
+ this.pattern = Pattern.compile( pattern );
+ this.replacement = replacement;
+ if( maxBlockChars < 1 )
+ throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
+ this.maxBlockChars = maxBlockChars;
+ this.blockDelimiters = blockDelimiters;
+ blockBuffer = new char[maxBlockChars];
+ }
+
+ private boolean prepareReplaceBlock() throws IOException {
+ while( true ){
+ if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
+ return true;
+ // prepare block buffer
+ blockBufferLength = 0;
+ while( true ){
+ int c = nextChar();
+ if( c == -1 ) break;
+ blockBuffer[blockBufferLength++] = (char)c;
+ // end of block?
+ boolean foundDelimiter =
+ ( blockDelimiters != null ) &&
+ ( blockDelimiters.length() > 0 ) &&
+ blockDelimiters.indexOf( c ) >= 0;
+ if( foundDelimiter ||
+ blockBufferLength >= maxBlockChars ) break;
+ }
+ // block buffer available?
+ if( blockBufferLength == 0 ) return false;
+ replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
+ replaceBlockBufferOffset = 0;
+ }
+ }
+
+ public int read() throws IOException {
+ while( prepareReplaceBlock() ){
+ return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
+ }
+ return -1;
+ }
+
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ char[] tmp = new char[len];
+ int l = input.read(tmp, 0, len);
+ if (l != -1) {
+ for(int i = 0; i < l; i++)
+ pushLastChar(tmp[i]);
+ }
+ l = 0;
+ for(int i = off; i < off + len; i++) {
+ int c = read();
+ if (c == -1) break;
+ cbuf[i] = (char) c;
+ l++;
+ }
+ return l == 0 ? -1 : l;
+ }
+
+ private int nextChar() throws IOException {
+ if (buffer != null && !buffer.isEmpty()) {
+ nextCharCounter++;
+ return buffer.removeFirst().charValue();
+ }
+ int c = input.read();
+ if( c != -1 )
+ nextCharCounter++;
+ return c;
+ }
+
+ private void pushLastChar(int c) {
+ if (buffer == null) {
+ buffer = new LinkedList();
+ }
+ buffer.addLast(new Character((char) c));
+ }
+
+ String getReplaceBlock( String block ){
+ char[] blockChars = block.toCharArray();
+ return getReplaceBlock( blockChars, 0, blockChars.length );
+ }
+
+ String getReplaceBlock( char block[], int offset, int length ){
+ StringBuffer replaceBlock = new StringBuffer();
+ String sourceBlock = new String( block, offset, length );
+ Matcher m = pattern.matcher( sourceBlock );
+ int lastMatchOffset = 0, lastDiff = 0;
+ while( m.find() ){
+ m.appendReplacement( replaceBlock, replacement );
+ // record cumulative diff for the offset correction
+ int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
+ if (diff != 0) {
+ int prevCumulativeDiff = getLastCumulativeDiff();
+ if (diff > 0) {
+ for(int i = 0; i < diff; i++){
+ addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
+ prevCumulativeDiff - 1 - i);
+ }
+ } else {
+ addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
+ prevCumulativeDiff - diff);
+ }
+ }
+ // save last offsets
+ lastMatchOffset = m.end( 0 );
+ lastDiff = diff;
+ }
+ // copy remaining of the part of source block
+ m.appendTail( replaceBlock );
+ return replaceBlock.toString();
+ }
+}
diff --git a/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java b/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
new file mode 100644
index 00000000000..958279b7452
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ *
+ * @version $Id$
+ * @since Solr 1.5
+ */
+public class PatternReplaceCharFilterFactory extends BaseCharFilterFactory {
+
+ private String pattern;
+ private String replacement;
+ private int maxBlockChars;
+ private String blockDelimiters;
+
+ public void init(Map args) {
+ super.init( args );
+ pattern = args.get( "pattern" );
+ if( pattern == null )
+ pattern = "";
+ replacement = args.get( "replacement" );
+ if( replacement == null )
+ replacement = "";
+ maxBlockChars = getInt( "maxBlockChars", PatternReplaceCharFilter.DEFAULT_MAX_BLOCK_CHARS );
+ blockDelimiters = args.get( "blockDelimiters" );
+ }
+
+ public CharStream create(CharStream input) {
+ return new PatternReplaceCharFilter( pattern, replacement, maxBlockChars, blockDelimiters, input );
+ }
+}
diff --git a/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java b/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java
new file mode 100644
index 00000000000..02c22589f42
--- /dev/null
+++ b/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ *
+ * @version $Id$
+ *
+ */
+public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
+
+ // 1111
+ // 01234567890123
+ // this is test.
+ public void testNothingChange() throws IOException {
+ final String BLOCK = "this is test.";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
+ }
+
+ // 012345678
+ // aa bb cc
+ public void testReplaceByEmpty() throws IOException {
+ final String BLOCK = "aa bb cc";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertEquals( 0, getTokens( ts ).size() );
+ }
+
+ // 012345678
+ // aa bb cc
+ // aa#bb#cc
+ public void test1block1matchSameLength() throws IOException {
+ final String BLOCK = "aa bb cc";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
+ }
+
+ // 11111
+ // 012345678901234
+ // aa bb cc dd
+ // aa##bb###cc dd
+ public void test1block1matchLonger() throws IOException {
+ final String BLOCK = "aa bb cc dd";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
+ }
+
+ // 01234567
+ // a a
+ // aa aa
+ public void test1block2matchLonger() throws IOException {
+ final String BLOCK = " a a";
+ CharStream cs = new PatternReplaceCharFilter( "a", "aa",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
+ }
+
+ // 11111
+ // 012345678901234
+ // aa bb cc dd
+ // aa#bb dd
+ public void test1block1matchShorter() throws IOException {
+ final String BLOCK = "aa bb cc dd";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
+ }
+
+ // 111111111122222222223333
+ // 0123456789012345678901234567890123
+ // aa bb cc --- aa bb aa bb cc
+ // aa bb cc --- aa bb aa bb cc
+ public void test1blockMultiMatches() throws IOException {
+ final String BLOCK = " aa bb cc --- aa bb aa bb cc";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
+ getTokens( ts ) );
+ }
+
+ // 11111111112222222222333333333
+ // 012345678901234567890123456789012345678
+ // aa bb cc --- aa bb aa. bb aa bb cc
+ // aa##bb cc --- aa##bb aa. bb aa##bb cc
+ public void test2blocksMultiMatches() throws IOException {
+ final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
+ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
+ getTokens( ts ) );
+ }
+
+ // 11111111112222222222333333333
+ // 012345678901234567890123456789012345678
+ // a bb - ccc . --- bb a . ccc ccc bb
+ // aa b - c . --- b aa . c c b
+ public void testChain() throws IOException {
+ final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
+ CharStream cs = new PatternReplaceCharFilter( "a", "aa", ".",
+ CharReader.get( new StringReader( BLOCK ) ) );
+ cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
+ cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
+ TokenStream ts = new WhitespaceTokenizer( cs );
+ assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
+ getTokens( ts ) );
+ }
+}