SOLR-1653: add PatternReplaceCharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@890798 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2009-12-15 14:13:59 +00:00
parent 79b97f996b
commit 8b450730d3
5 changed files with 404 additions and 0 deletions

View File

@ -69,6 +69,8 @@ New Features
* SOLR-1532: Allow StreamingUpdateSolrServer to use a provided HttpClient (Gabriele Renzi via shalin)
* SOLR-1653: Add PatternReplaceCharFilter (koji)
Optimizations
----------------------

View File

@ -43,4 +43,20 @@ public abstract class BaseCharFilterFactory implements CharFilterFactory {
this.args = args;
}
protected int getInt(String name) {
return getInt(name,-1,false);
}
protected int getInt(String name, int defaultVal) {
return getInt(name,defaultVal,true);
}
protected int getInt(String name, int defaultVal, boolean useDefault) {
String s = args.get(name);
if (s==null) {
if (useDefault) return defaultVal;
throw new RuntimeException("Configuration Error: missing parameter '" + name + "'");
}
return Integer.parseInt(s);
}
}

View File

@ -0,0 +1,193 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;
/**
* CharFilter that uses a regular expression for the target of replace string.
* The pattern match will be done in each "block" in char stream.
*
* <p>
* ex1) source="aa&nbsp;&nbsp;bb&nbsp;aa&nbsp;bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br/>
* output="aa#bb&nbsp;aa#bb"
* </p>
*
* NOTE: If you produce a phrase that has different length to source string
* and the field is used for highlighting for a term of the phrase, you will
* face a trouble.
*
* <p>
* ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1&nbsp;$2"<br/>
* output="aa&nbsp;bb"<br/>
* and you want to search bb and highlight it, you will get<br/>
* highlight snippet="aa1&lt;em&gt;23bb&lt;/em&gt;"
* </p>
*
* @version $Id$
* @since Solr 1.5
*/
public class PatternReplaceCharFilter extends BaseCharFilter {
private final Pattern pattern;
private final String replacement;
private final int maxBlockChars;
private final String blockDelimiters;
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
private LinkedList<Character> buffer;
private int nextCharCounter;
private char[] blockBuffer;
private int blockBufferLength;
private String replaceBlockBuffer;
private int replaceBlockBufferOffset;
public PatternReplaceCharFilter( String pattern, String replacement, CharStream in ){
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
}
public PatternReplaceCharFilter( String pattern, String replacement,
int maxBlockChars, CharStream in ){
this( pattern, replacement, maxBlockChars, null, in );
}
public PatternReplaceCharFilter( String pattern, String replacement,
String blockDelimiters, CharStream in ){
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
}
public PatternReplaceCharFilter( String pattern, String replacement,
int maxBlockChars, String blockDelimiters, CharStream in ){
super( in );
this.pattern = Pattern.compile( pattern );
this.replacement = replacement;
if( maxBlockChars < 1 )
throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
this.maxBlockChars = maxBlockChars;
this.blockDelimiters = blockDelimiters;
blockBuffer = new char[maxBlockChars];
}
private boolean prepareReplaceBlock() throws IOException {
while( true ){
if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
return true;
// prepare block buffer
blockBufferLength = 0;
while( true ){
int c = nextChar();
if( c == -1 ) break;
blockBuffer[blockBufferLength++] = (char)c;
// end of block?
boolean foundDelimiter =
( blockDelimiters != null ) &&
( blockDelimiters.length() > 0 ) &&
blockDelimiters.indexOf( c ) >= 0;
if( foundDelimiter ||
blockBufferLength >= maxBlockChars ) break;
}
// block buffer available?
if( blockBufferLength == 0 ) return false;
replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
replaceBlockBufferOffset = 0;
}
}
public int read() throws IOException {
while( prepareReplaceBlock() ){
return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
}
return -1;
}
public int read(char[] cbuf, int off, int len) throws IOException {
char[] tmp = new char[len];
int l = input.read(tmp, 0, len);
if (l != -1) {
for(int i = 0; i < l; i++)
pushLastChar(tmp[i]);
}
l = 0;
for(int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
l++;
}
return l == 0 ? -1 : l;
}
private int nextChar() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
nextCharCounter++;
return buffer.removeFirst().charValue();
}
int c = input.read();
if( c != -1 )
nextCharCounter++;
return c;
}
private void pushLastChar(int c) {
if (buffer == null) {
buffer = new LinkedList<Character>();
}
buffer.addLast(new Character((char) c));
}
String getReplaceBlock( String block ){
char[] blockChars = block.toCharArray();
return getReplaceBlock( blockChars, 0, blockChars.length );
}
String getReplaceBlock( char block[], int offset, int length ){
StringBuffer replaceBlock = new StringBuffer();
String sourceBlock = new String( block, offset, length );
Matcher m = pattern.matcher( sourceBlock );
int lastMatchOffset = 0, lastDiff = 0;
while( m.find() ){
m.appendReplacement( replaceBlock, replacement );
// record cumulative diff for the offset correction
int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
if (diff != 0) {
int prevCumulativeDiff = getLastCumulativeDiff();
if (diff > 0) {
for(int i = 0; i < diff; i++){
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
prevCumulativeDiff - 1 - i);
}
} else {
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
prevCumulativeDiff - diff);
}
}
// save last offsets
lastMatchOffset = m.end( 0 );
lastDiff = diff;
}
// copy remaining of the part of source block
m.appendTail( replaceBlock );
return replaceBlock.toString();
}
}

View File

@ -0,0 +1,51 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.CharStream;
/**
*
* @version $Id$
* @since Solr 1.5
*/
public class PatternReplaceCharFilterFactory extends BaseCharFilterFactory {
private String pattern;
private String replacement;
private int maxBlockChars;
private String blockDelimiters;
public void init(Map<String, String> args) {
super.init( args );
pattern = args.get( "pattern" );
if( pattern == null )
pattern = "";
replacement = args.get( "replacement" );
if( replacement == null )
replacement = "";
maxBlockChars = getInt( "maxBlockChars", PatternReplaceCharFilter.DEFAULT_MAX_BLOCK_CHARS );
blockDelimiters = args.get( "blockDelimiters" );
}
public CharStream create(CharStream input) {
return new PatternReplaceCharFilter( pattern, replacement, maxBlockChars, blockDelimiters, input );
}
}

View File

@ -0,0 +1,142 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
*
* @version $Id$
*
*/
public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
// 1111
// 01234567890123
// this is test.
public void testNothingChange() throws IOException {
final String BLOCK = "this is test.";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
}
// 012345678
// aa bb cc
public void testReplaceByEmpty() throws IOException {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertEquals( 0, getTokens( ts ).size() );
}
// 012345678
// aa bb cc
// aa#bb#cc
public void test1block1matchSameLength() throws IOException {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
}
// 11111
// 012345678901234
// aa bb cc dd
// aa##bb###cc dd
public void test1block1matchLonger() throws IOException {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
}
// 01234567
// a a
// aa aa
public void test1block2matchLonger() throws IOException {
final String BLOCK = " a a";
CharStream cs = new PatternReplaceCharFilter( "a", "aa",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
}
// 11111
// 012345678901234
// aa bb cc dd
// aa#bb dd
public void test1block1matchShorter() throws IOException {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
}
// 111111111122222222223333
// 0123456789012345678901234567890123
// aa bb cc --- aa bb aa bb cc
// aa bb cc --- aa bb aa bb cc
public void test1blockMultiMatches() throws IOException {
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
getTokens( ts ) );
}
// 11111111112222222222333333333
// 012345678901234567890123456789012345678
// aa bb cc --- aa bb aa. bb aa bb cc
// aa##bb cc --- aa##bb aa. bb aa##bb cc
public void test2blocksMultiMatches() throws IOException {
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
getTokens( ts ) );
}
// 11111111112222222222333333333
// 012345678901234567890123456789012345678
// a bb - ccc . --- bb a . ccc ccc bb
// aa b - c . --- b aa . c c b
public void testChain() throws IOException {
final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
CharStream cs = new PatternReplaceCharFilter( "a", "aa", ".",
CharReader.get( new StringReader( BLOCK ) ) );
cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
getTokens( ts ) );
}
}