SOLR-1653: add PatternReplaceCharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@890798 13f79535-47bb-0310-9956-ffa450edef68
2009-12-15 14:13:59 +00:00 · 2009-12-15 14:13:59 +00:00 · 8b450730d3
parent 79b97f996b
commit 8b450730d3
5 changed files with 404 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -69,6 +69,8 @@ New Features

 * SOLR-1532: Allow StreamingUpdateSolrServer to use a provided HttpClient (Gabriele Renzi via shalin)

+* SOLR-1653: Add PatternReplaceCharFilter (koji)
+
 Optimizations
 ----------------------

--- a/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
@ -43,4 +43,20 @@ public abstract class BaseCharFilterFactory implements CharFilterFactory {
    this.args = args;
  }

+  protected int getInt(String name) {
+    return getInt(name,-1,false);
+  }
+
+  protected int getInt(String name, int defaultVal) {
+    return getInt(name,defaultVal,true);
+  }
+
+  protected int getInt(String name, int defaultVal, boolean useDefault) {
+    String s = args.get(name);
+    if (s==null) {
+      if (useDefault) return defaultVal;
+      throw new RuntimeException("Configuration Error: missing parameter '" + name + "'");
+    }
+    return Integer.parseInt(s);
+  }
 }
--- a/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java
+++ b/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java
@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.BaseCharFilter;
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * CharFilter that uses a regular expression for the target of replace string.
+ * The pattern match will be done in each "block" in char stream.
+ * 
+ * <p>
+ * ex1) source="aa&nbsp;&nbsp;bb&nbsp;aa&nbsp;bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br/>
+ * output="aa#bb&nbsp;aa#bb"
+ * </p>
+ * 
+ * NOTE: If you produce a phrase that has different length to source string
+ * and the field is used for highlighting for a term of the phrase, you will
+ * face a trouble.
+ * 
+ * <p>
+ * ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1&nbsp;$2"<br/>
+ * output="aa&nbsp;bb"<br/>
+ * and you want to search bb and highlight it, you will get<br/>
+ * highlight snippet="aa1&lt;em&gt;23bb&lt;/em&gt;"
+ * </p>
+ * 
+ * @version $Id$
+ * @since Solr 1.5
+ */
+public class PatternReplaceCharFilter extends BaseCharFilter {
+
+  private final Pattern pattern;
+  private final String replacement;
+  private final int maxBlockChars;
+  private final String blockDelimiters;
+  public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
+
+  private LinkedList<Character> buffer;
+  private int nextCharCounter;
+  private char[] blockBuffer;
+  private int blockBufferLength;
+  private String replaceBlockBuffer;
+  private int replaceBlockBufferOffset;
+  
+  public PatternReplaceCharFilter( String pattern, String replacement, CharStream in ){
+    this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
+  }
+
+  public PatternReplaceCharFilter( String pattern, String replacement,
+      int maxBlockChars, CharStream in ){
+    this( pattern, replacement, maxBlockChars, null, in );
+  }
+
+  public PatternReplaceCharFilter( String pattern, String replacement,
+      String blockDelimiters, CharStream in ){
+    this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
+  }
+
+  public PatternReplaceCharFilter( String pattern, String replacement,
+      int maxBlockChars, String blockDelimiters, CharStream in ){
+    super( in );
+    this.pattern = Pattern.compile( pattern );
+    this.replacement = replacement;
+    if( maxBlockChars < 1 )
+      throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
+    this.maxBlockChars = maxBlockChars;
+    this.blockDelimiters = blockDelimiters;
+    blockBuffer = new char[maxBlockChars];
+  }
+  
+  private boolean prepareReplaceBlock() throws IOException {
+    while( true ){
+      if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
+        return true;
+      // prepare block buffer
+      blockBufferLength = 0;
+      while( true ){
+        int c = nextChar();
+        if( c == -1 ) break;
+        blockBuffer[blockBufferLength++] = (char)c;
+        // end of block?
+        boolean foundDelimiter =
+          ( blockDelimiters != null ) &&
+          ( blockDelimiters.length() > 0 ) &&
+          blockDelimiters.indexOf( c ) >= 0;
+        if( foundDelimiter ||
+            blockBufferLength >= maxBlockChars ) break;
+      }
+      // block buffer available?
+      if( blockBufferLength == 0 ) return false;
+      replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
+      replaceBlockBufferOffset = 0;
+    }
+  }
+
+  public int read() throws IOException {
+    while( prepareReplaceBlock() ){
+      return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
+    }
+    return -1;
+  }
+
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    char[] tmp = new char[len];
+    int l = input.read(tmp, 0, len);
+    if (l != -1) {
+      for(int i = 0; i < l; i++)
+        pushLastChar(tmp[i]);
+    }
+    l = 0;
+    for(int i = off; i < off + len; i++) {
+      int c = read();
+      if (c == -1) break;
+      cbuf[i] = (char) c;
+      l++;
+    }
+    return l == 0 ? -1 : l;
+  }
+
+  private int nextChar() throws IOException {
+    if (buffer != null && !buffer.isEmpty()) {
+      nextCharCounter++;
+      return buffer.removeFirst().charValue();
+    }
+    int c = input.read();
+    if( c != -1 )
+      nextCharCounter++;
+    return c;
+  }
+
+  private void pushLastChar(int c) {
+    if (buffer == null) {
+      buffer = new LinkedList<Character>();
+    }
+    buffer.addLast(new Character((char) c));
+  }
+  
+  String getReplaceBlock( String block ){
+    char[] blockChars = block.toCharArray();
+    return getReplaceBlock( blockChars, 0, blockChars.length );
+  }
+    
+  String getReplaceBlock( char block[], int offset, int length ){
+    StringBuffer replaceBlock = new StringBuffer();
+    String sourceBlock = new String( block, offset, length );
+    Matcher m = pattern.matcher( sourceBlock );
+    int lastMatchOffset = 0, lastDiff = 0;
+    while( m.find() ){
+      m.appendReplacement( replaceBlock, replacement );
+      // record cumulative diff for the offset correction
+      int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
+      if (diff != 0) {
+        int prevCumulativeDiff = getLastCumulativeDiff();
+        if (diff > 0) {
+          for(int i = 0; i < diff; i++){
+            addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
+                prevCumulativeDiff - 1 - i);
+          }
+        } else {
+          addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
+              prevCumulativeDiff - diff);
+        }
+      }
+      // save last offsets
+      lastMatchOffset = m.end( 0 );
+      lastDiff = diff;
+    }
+    // copy remaining of the part of source block
+    m.appendTail( replaceBlock );
+    return replaceBlock.toString();
+  }
+}
--- a/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * 
+ * @version $Id$
+ * @since Solr 1.5
+ */
+public class PatternReplaceCharFilterFactory extends BaseCharFilterFactory {
+  
+  private String pattern;
+  private String replacement;
+  private int maxBlockChars;
+  private String blockDelimiters;
+
+  public void init(Map<String, String> args) {
+    super.init( args );
+    pattern = args.get( "pattern" );
+    if( pattern == null )
+      pattern = "";
+    replacement = args.get( "replacement" );
+    if( replacement == null )
+      replacement = "";
+    maxBlockChars = getInt( "maxBlockChars", PatternReplaceCharFilter.DEFAULT_MAX_BLOCK_CHARS );
+    blockDelimiters = args.get( "blockDelimiters" );
+  }
+
+  public CharStream create(CharStream input) {
+    return new PatternReplaceCharFilter( pattern, replacement, maxBlockChars, blockDelimiters, input );
+  }
+}
--- a/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java
+++ b/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java
@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * 
+ * @version $Id$
+ *
+ */
+public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
+  
+  //           1111
+  // 01234567890123
+  // this is test.
+  public void testNothingChange() throws IOException {
+    final String BLOCK = "this is test.";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
+  }
+  
+  // 012345678
+  // aa bb cc
+  public void testReplaceByEmpty() throws IOException {
+    final String BLOCK = "aa bb cc";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertEquals( 0, getTokens( ts ).size() );
+  }
+  
+  // 012345678
+  // aa bb cc
+  // aa#bb#cc
+  public void test1block1matchSameLength() throws IOException {
+    final String BLOCK = "aa bb cc";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
+  }
+
+  //           11111
+  // 012345678901234
+  // aa bb cc dd
+  // aa##bb###cc dd
+  public void test1block1matchLonger() throws IOException {
+    final String BLOCK = "aa bb cc dd";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
+  }
+
+  // 01234567
+  //  a  a
+  //  aa  aa
+  public void test1block2matchLonger() throws IOException {
+    final String BLOCK = " a  a";
+    CharStream cs = new PatternReplaceCharFilter( "a", "aa",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
+  }
+
+  //           11111
+  // 012345678901234
+  // aa  bb   cc dd
+  // aa#bb dd
+  public void test1block1matchShorter() throws IOException {
+    final String BLOCK = "aa  bb   cc dd";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
+  }
+
+  //           111111111122222222223333
+  // 0123456789012345678901234567890123
+  //   aa bb cc --- aa bb aa   bb   cc
+  //   aa  bb  cc --- aa bb aa  bb  cc
+  public void test1blockMultiMatches() throws IOException {
+    final String BLOCK = "  aa bb cc --- aa bb aa   bb   cc";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1  $2  $3",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
+        getTokens( ts ) );
+  }
+
+  //           11111111112222222222333333333
+  // 012345678901234567890123456789012345678
+  //   aa bb cc --- aa bb aa. bb aa   bb cc
+  //   aa##bb cc --- aa##bb aa. bb aa##bb cc
+  public void test2blocksMultiMatches() throws IOException {
+    final String BLOCK = "  aa bb cc --- aa bb aa. bb aa   bb cc";
+    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
+        getTokens( ts ) );
+  }
+
+  //           11111111112222222222333333333
+  // 012345678901234567890123456789012345678
+  //  a bb - ccc . --- bb a . ccc ccc bb
+  //  aa b - c . --- b aa . c c b
+  public void testChain() throws IOException {
+    final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
+    CharStream cs = new PatternReplaceCharFilter( "a", "aa", ".",
+        CharReader.get( new StringReader( BLOCK ) ) );
+    cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
+    cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
+    TokenStream ts = new WhitespaceTokenizer( cs );
+    assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
+        getTokens( ts ) );
+  }
+}