From 8b450730d3e1ff646e28ee1020b5e7b0df81c6ab Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Tue, 15 Dec 2009 14:13:59 +0000 Subject: [PATCH] SOLR-1653: add PatternReplaceCharFilter git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@890798 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 + .../solr/analysis/BaseCharFilterFactory.java | 16 ++ .../analysis/PatternReplaceCharFilter.java | 193 ++++++++++++++++++ .../PatternReplaceCharFilterFactory.java | 51 +++++ .../TestPatternReplaceCharFilter.java | 142 +++++++++++++ 5 files changed, 404 insertions(+) create mode 100644 src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java create mode 100644 src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java create mode 100644 src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java diff --git a/CHANGES.txt b/CHANGES.txt index 94d45959c8b..8e09f30cb39 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -69,6 +69,8 @@ New Features * SOLR-1532: Allow StreamingUpdateSolrServer to use a provided HttpClient (Gabriele Renzi via shalin) +* SOLR-1653: Add PatternReplaceCharFilter (koji) + Optimizations ---------------------- diff --git a/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java b/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java index 417e601124e..62ff65dd0ef 100644 --- a/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java +++ b/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java @@ -43,4 +43,20 @@ public abstract class BaseCharFilterFactory implements CharFilterFactory { this.args = args; } + protected int getInt(String name) { + return getInt(name,-1,false); + } + + protected int getInt(String name, int defaultVal) { + return getInt(name,defaultVal,true); + } + + protected int getInt(String name, int defaultVal, boolean useDefault) { + String s = args.get(name); + if (s==null) { + if (useDefault) return defaultVal; + throw new RuntimeException("Configuration Error: missing parameter '" + name + "'"); + } + return Integer.parseInt(s); + } } diff --git a/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java b/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java new file mode 100644 index 00000000000..adc24636a0e --- /dev/null +++ b/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java @@ -0,0 +1,193 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.BaseCharFilter; +import org.apache.lucene.analysis.CharStream; + +/** + * CharFilter that uses a regular expression for the target of replace string. + * The pattern match will be done in each "block" in char stream. + * + *

+ * ex1) source="aa  bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"
+ * output="aa#bb aa#bb" + *

+ * + * NOTE: If you produce a phrase that has different length to source string + * and the field is used for highlighting for a term of the phrase, you will + * face a trouble. + * + *

+ * ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"
+ * output="aa bb"
+ * and you want to search bb and highlight it, you will get
+ * highlight snippet="aa1<em>23bb</em>" + *

+ * + * @version $Id$ + * @since Solr 1.5 + */ +public class PatternReplaceCharFilter extends BaseCharFilter { + + private final Pattern pattern; + private final String replacement; + private final int maxBlockChars; + private final String blockDelimiters; + public static final int DEFAULT_MAX_BLOCK_CHARS = 10000; + + private LinkedList buffer; + private int nextCharCounter; + private char[] blockBuffer; + private int blockBufferLength; + private String replaceBlockBuffer; + private int replaceBlockBufferOffset; + + public PatternReplaceCharFilter( String pattern, String replacement, CharStream in ){ + this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in ); + } + + public PatternReplaceCharFilter( String pattern, String replacement, + int maxBlockChars, CharStream in ){ + this( pattern, replacement, maxBlockChars, null, in ); + } + + public PatternReplaceCharFilter( String pattern, String replacement, + String blockDelimiters, CharStream in ){ + this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in ); + } + + public PatternReplaceCharFilter( String pattern, String replacement, + int maxBlockChars, String blockDelimiters, CharStream in ){ + super( in ); + this.pattern = Pattern.compile( pattern ); + this.replacement = replacement; + if( maxBlockChars < 1 ) + throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars ); + this.maxBlockChars = maxBlockChars; + this.blockDelimiters = blockDelimiters; + blockBuffer = new char[maxBlockChars]; + } + + private boolean prepareReplaceBlock() throws IOException { + while( true ){ + if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset ) + return true; + // prepare block buffer + blockBufferLength = 0; + while( true ){ + int c = nextChar(); + if( c == -1 ) break; + blockBuffer[blockBufferLength++] = (char)c; + // end of block? + boolean foundDelimiter = + ( blockDelimiters != null ) && + ( blockDelimiters.length() > 0 ) && + blockDelimiters.indexOf( c ) >= 0; + if( foundDelimiter || + blockBufferLength >= maxBlockChars ) break; + } + // block buffer available? + if( blockBufferLength == 0 ) return false; + replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength ); + replaceBlockBufferOffset = 0; + } + } + + public int read() throws IOException { + while( prepareReplaceBlock() ){ + return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ ); + } + return -1; + } + + public int read(char[] cbuf, int off, int len) throws IOException { + char[] tmp = new char[len]; + int l = input.read(tmp, 0, len); + if (l != -1) { + for(int i = 0; i < l; i++) + pushLastChar(tmp[i]); + } + l = 0; + for(int i = off; i < off + len; i++) { + int c = read(); + if (c == -1) break; + cbuf[i] = (char) c; + l++; + } + return l == 0 ? -1 : l; + } + + private int nextChar() throws IOException { + if (buffer != null && !buffer.isEmpty()) { + nextCharCounter++; + return buffer.removeFirst().charValue(); + } + int c = input.read(); + if( c != -1 ) + nextCharCounter++; + return c; + } + + private void pushLastChar(int c) { + if (buffer == null) { + buffer = new LinkedList(); + } + buffer.addLast(new Character((char) c)); + } + + String getReplaceBlock( String block ){ + char[] blockChars = block.toCharArray(); + return getReplaceBlock( blockChars, 0, blockChars.length ); + } + + String getReplaceBlock( char block[], int offset, int length ){ + StringBuffer replaceBlock = new StringBuffer(); + String sourceBlock = new String( block, offset, length ); + Matcher m = pattern.matcher( sourceBlock ); + int lastMatchOffset = 0, lastDiff = 0; + while( m.find() ){ + m.appendReplacement( replaceBlock, replacement ); + // record cumulative diff for the offset correction + int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset ); + if (diff != 0) { + int prevCumulativeDiff = getLastCumulativeDiff(); + if (diff > 0) { + for(int i = 0; i < diff; i++){ + addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff, + prevCumulativeDiff - 1 - i); + } + } else { + addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff, + prevCumulativeDiff - diff); + } + } + // save last offsets + lastMatchOffset = m.end( 0 ); + lastDiff = diff; + } + // copy remaining of the part of source block + m.appendTail( replaceBlock ); + return replaceBlock.toString(); + } +} diff --git a/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java b/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java new file mode 100644 index 00000000000..958279b7452 --- /dev/null +++ b/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.util.Map; + +import org.apache.lucene.analysis.CharStream; + +/** + * + * @version $Id$ + * @since Solr 1.5 + */ +public class PatternReplaceCharFilterFactory extends BaseCharFilterFactory { + + private String pattern; + private String replacement; + private int maxBlockChars; + private String blockDelimiters; + + public void init(Map args) { + super.init( args ); + pattern = args.get( "pattern" ); + if( pattern == null ) + pattern = ""; + replacement = args.get( "replacement" ); + if( replacement == null ) + replacement = ""; + maxBlockChars = getInt( "maxBlockChars", PatternReplaceCharFilter.DEFAULT_MAX_BLOCK_CHARS ); + blockDelimiters = args.get( "blockDelimiters" ); + } + + public CharStream create(CharStream input) { + return new PatternReplaceCharFilter( pattern, replacement, maxBlockChars, blockDelimiters, input ); + } +} diff --git a/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java b/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java new file mode 100644 index 00000000000..02c22589f42 --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.CharReader; +import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * + * @version $Id$ + * + */ +public class TestPatternReplaceCharFilter extends BaseTokenTestCase { + + // 1111 + // 01234567890123 + // this is test. + public void testNothingChange() throws IOException { + final String BLOCK = "this is test."; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) ); + } + + // 012345678 + // aa bb cc + public void testReplaceByEmpty() throws IOException { + final String BLOCK = "aa bb cc"; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertEquals( 0, getTokens( ts ).size() ); + } + + // 012345678 + // aa bb cc + // aa#bb#cc + public void test1block1matchSameLength() throws IOException { + final String BLOCK = "aa bb cc"; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) ); + } + + // 11111 + // 012345678901234 + // aa bb cc dd + // aa##bb###cc dd + public void test1block1matchLonger() throws IOException { + final String BLOCK = "aa bb cc dd"; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) ); + } + + // 01234567 + // a a + // aa aa + public void test1block2matchLonger() throws IOException { + final String BLOCK = " a a"; + CharStream cs = new PatternReplaceCharFilter( "a", "aa", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) ); + } + + // 11111 + // 012345678901234 + // aa bb cc dd + // aa#bb dd + public void test1block1matchShorter() throws IOException { + final String BLOCK = "aa bb cc dd"; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) ); + } + + // 111111111122222222223333 + // 0123456789012345678901234567890123 + // aa bb cc --- aa bb aa bb cc + // aa bb cc --- aa bb aa bb cc + public void test1blockMultiMatches() throws IOException { + final String BLOCK = " aa bb cc --- aa bb aa bb cc"; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ), + getTokens( ts ) ); + } + + // 11111111112222222222333333333 + // 012345678901234567890123456789012345678 + // aa bb cc --- aa bb aa. bb aa bb cc + // aa##bb cc --- aa##bb aa. bb aa##bb cc + public void test2blocksMultiMatches() throws IOException { + final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; + CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".", + CharReader.get( new StringReader( BLOCK ) ) ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ), + getTokens( ts ) ); + } + + // 11111111112222222222333333333 + // 012345678901234567890123456789012345678 + // a bb - ccc . --- bb a . ccc ccc bb + // aa b - c . --- b aa . c c b + public void testChain() throws IOException { + final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; + CharStream cs = new PatternReplaceCharFilter( "a", "aa", ".", + CharReader.get( new StringReader( BLOCK ) ) ); + cs = new PatternReplaceCharFilter( "bb", "b", ".", cs ); + cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs ); + TokenStream ts = new WhitespaceTokenizer( cs ); + assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ), + getTokens( ts ) ); + } +}