mirror of https://github.com/apache/lucene.git
SOLR-1653: add PatternReplaceCharFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@890798 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
79b97f996b
commit
8b450730d3
|
@ -69,6 +69,8 @@ New Features
|
|||
|
||||
* SOLR-1532: Allow StreamingUpdateSolrServer to use a provided HttpClient (Gabriele Renzi via shalin)
|
||||
|
||||
* SOLR-1653: Add PatternReplaceCharFilter (koji)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -43,4 +43,20 @@ public abstract class BaseCharFilterFactory implements CharFilterFactory {
|
|||
this.args = args;
|
||||
}
|
||||
|
||||
protected int getInt(String name) {
|
||||
return getInt(name,-1,false);
|
||||
}
|
||||
|
||||
protected int getInt(String name, int defaultVal) {
|
||||
return getInt(name,defaultVal,true);
|
||||
}
|
||||
|
||||
protected int getInt(String name, int defaultVal, boolean useDefault) {
|
||||
String s = args.get(name);
|
||||
if (s==null) {
|
||||
if (useDefault) return defaultVal;
|
||||
throw new RuntimeException("Configuration Error: missing parameter '" + name + "'");
|
||||
}
|
||||
return Integer.parseInt(s);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,193 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.BaseCharFilter;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
|
||||
/**
|
||||
* CharFilter that uses a regular expression for the target of replace string.
|
||||
* The pattern match will be done in each "block" in char stream.
|
||||
*
|
||||
* <p>
|
||||
* ex1) source="aa bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br/>
|
||||
* output="aa#bb aa#bb"
|
||||
* </p>
|
||||
*
|
||||
* NOTE: If you produce a phrase that has different length to source string
|
||||
* and the field is used for highlighting for a term of the phrase, you will
|
||||
 * run into trouble, as ex2 below shows.
|
||||
*
|
||||
* <p>
|
||||
* ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"<br/>
|
||||
* output="aa bb"<br/>
|
||||
* and you want to search bb and highlight it, you will get<br/>
|
||||
* highlight snippet="aa1<em>23bb</em>"
|
||||
* </p>
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.5
|
||||
*/
|
||||
public class PatternReplaceCharFilter extends BaseCharFilter {
|
||||
|
||||
private final Pattern pattern;
|
||||
private final String replacement;
|
||||
private final int maxBlockChars;
|
||||
private final String blockDelimiters;
|
||||
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
|
||||
|
||||
private LinkedList<Character> buffer;
|
||||
private int nextCharCounter;
|
||||
private char[] blockBuffer;
|
||||
private int blockBufferLength;
|
||||
private String replaceBlockBuffer;
|
||||
private int replaceBlockBufferOffset;
|
||||
|
||||
public PatternReplaceCharFilter( String pattern, String replacement, CharStream in ){
|
||||
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
|
||||
}
|
||||
|
||||
public PatternReplaceCharFilter( String pattern, String replacement,
|
||||
int maxBlockChars, CharStream in ){
|
||||
this( pattern, replacement, maxBlockChars, null, in );
|
||||
}
|
||||
|
||||
public PatternReplaceCharFilter( String pattern, String replacement,
|
||||
String blockDelimiters, CharStream in ){
|
||||
this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
|
||||
}
|
||||
|
||||
public PatternReplaceCharFilter( String pattern, String replacement,
|
||||
int maxBlockChars, String blockDelimiters, CharStream in ){
|
||||
super( in );
|
||||
this.pattern = Pattern.compile( pattern );
|
||||
this.replacement = replacement;
|
||||
if( maxBlockChars < 1 )
|
||||
throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
|
||||
this.maxBlockChars = maxBlockChars;
|
||||
this.blockDelimiters = blockDelimiters;
|
||||
blockBuffer = new char[maxBlockChars];
|
||||
}
|
||||
|
||||
private boolean prepareReplaceBlock() throws IOException {
|
||||
while( true ){
|
||||
if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
|
||||
return true;
|
||||
// prepare block buffer
|
||||
blockBufferLength = 0;
|
||||
while( true ){
|
||||
int c = nextChar();
|
||||
if( c == -1 ) break;
|
||||
blockBuffer[blockBufferLength++] = (char)c;
|
||||
// end of block?
|
||||
boolean foundDelimiter =
|
||||
( blockDelimiters != null ) &&
|
||||
( blockDelimiters.length() > 0 ) &&
|
||||
blockDelimiters.indexOf( c ) >= 0;
|
||||
if( foundDelimiter ||
|
||||
blockBufferLength >= maxBlockChars ) break;
|
||||
}
|
||||
// block buffer available?
|
||||
if( blockBufferLength == 0 ) return false;
|
||||
replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
|
||||
replaceBlockBufferOffset = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
while( prepareReplaceBlock() ){
|
||||
return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
char[] tmp = new char[len];
|
||||
int l = input.read(tmp, 0, len);
|
||||
if (l != -1) {
|
||||
for(int i = 0; i < l; i++)
|
||||
pushLastChar(tmp[i]);
|
||||
}
|
||||
l = 0;
|
||||
for(int i = off; i < off + len; i++) {
|
||||
int c = read();
|
||||
if (c == -1) break;
|
||||
cbuf[i] = (char) c;
|
||||
l++;
|
||||
}
|
||||
return l == 0 ? -1 : l;
|
||||
}
|
||||
|
||||
private int nextChar() throws IOException {
|
||||
if (buffer != null && !buffer.isEmpty()) {
|
||||
nextCharCounter++;
|
||||
return buffer.removeFirst().charValue();
|
||||
}
|
||||
int c = input.read();
|
||||
if( c != -1 )
|
||||
nextCharCounter++;
|
||||
return c;
|
||||
}
|
||||
|
||||
private void pushLastChar(int c) {
|
||||
if (buffer == null) {
|
||||
buffer = new LinkedList<Character>();
|
||||
}
|
||||
buffer.addLast(new Character((char) c));
|
||||
}
|
||||
|
||||
String getReplaceBlock( String block ){
|
||||
char[] blockChars = block.toCharArray();
|
||||
return getReplaceBlock( blockChars, 0, blockChars.length );
|
||||
}
|
||||
|
||||
String getReplaceBlock( char block[], int offset, int length ){
|
||||
StringBuffer replaceBlock = new StringBuffer();
|
||||
String sourceBlock = new String( block, offset, length );
|
||||
Matcher m = pattern.matcher( sourceBlock );
|
||||
int lastMatchOffset = 0, lastDiff = 0;
|
||||
while( m.find() ){
|
||||
m.appendReplacement( replaceBlock, replacement );
|
||||
// record cumulative diff for the offset correction
|
||||
int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
|
||||
if (diff != 0) {
|
||||
int prevCumulativeDiff = getLastCumulativeDiff();
|
||||
if (diff > 0) {
|
||||
for(int i = 0; i < diff; i++){
|
||||
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
|
||||
prevCumulativeDiff - 1 - i);
|
||||
}
|
||||
} else {
|
||||
addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
|
||||
prevCumulativeDiff - diff);
|
||||
}
|
||||
}
|
||||
// save last offsets
|
||||
lastMatchOffset = m.end( 0 );
|
||||
lastDiff = diff;
|
||||
}
|
||||
// copy remaining of the part of source block
|
||||
m.appendTail( replaceBlock );
|
||||
return replaceBlock.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.5
|
||||
*/
|
||||
public class PatternReplaceCharFilterFactory extends BaseCharFilterFactory {
|
||||
|
||||
private String pattern;
|
||||
private String replacement;
|
||||
private int maxBlockChars;
|
||||
private String blockDelimiters;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
super.init( args );
|
||||
pattern = args.get( "pattern" );
|
||||
if( pattern == null )
|
||||
pattern = "";
|
||||
replacement = args.get( "replacement" );
|
||||
if( replacement == null )
|
||||
replacement = "";
|
||||
maxBlockChars = getInt( "maxBlockChars", PatternReplaceCharFilter.DEFAULT_MAX_BLOCK_CHARS );
|
||||
blockDelimiters = args.get( "blockDelimiters" );
|
||||
}
|
||||
|
||||
public CharStream create(CharStream input) {
|
||||
return new PatternReplaceCharFilter( pattern, replacement, maxBlockChars, blockDelimiters, input );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,142 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
*
|
||||
*/
|
||||
public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
||||
|
||||
// 1111
|
||||
// 01234567890123
|
||||
// this is test.
|
||||
public void testNothingChange() throws IOException {
|
||||
final String BLOCK = "this is test.";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 012345678
|
||||
// aa bb cc
|
||||
public void testReplaceByEmpty() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertEquals( 0, getTokens( ts ).size() );
|
||||
}
|
||||
|
||||
// 012345678
|
||||
// aa bb cc
|
||||
// aa#bb#cc
|
||||
public void test1block1matchSameLength() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 11111
|
||||
// 012345678901234
|
||||
// aa bb cc dd
|
||||
// aa##bb###cc dd
|
||||
public void test1block1matchLonger() throws IOException {
|
||||
final String BLOCK = "aa bb cc dd";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 01234567
|
||||
// a a
|
||||
// aa aa
|
||||
public void test1block2matchLonger() throws IOException {
|
||||
final String BLOCK = " a a";
|
||||
CharStream cs = new PatternReplaceCharFilter( "a", "aa",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 11111
|
||||
// 012345678901234
|
||||
// aa bb cc dd
|
||||
// aa#bb dd
|
||||
public void test1block1matchShorter() throws IOException {
|
||||
final String BLOCK = "aa bb cc dd";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 111111111122222222223333
|
||||
// 0123456789012345678901234567890123
|
||||
// aa bb cc --- aa bb aa bb cc
|
||||
// aa bb cc --- aa bb aa bb cc
|
||||
public void test1blockMultiMatches() throws IOException {
|
||||
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
|
||||
getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 11111111112222222222333333333
|
||||
// 012345678901234567890123456789012345678
|
||||
// aa bb cc --- aa bb aa. bb aa bb cc
|
||||
// aa##bb cc --- aa##bb aa. bb aa##bb cc
|
||||
public void test2blocksMultiMatches() throws IOException {
|
||||
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
|
||||
getTokens( ts ) );
|
||||
}
|
||||
|
||||
// 11111111112222222222333333333
|
||||
// 012345678901234567890123456789012345678
|
||||
// a bb - ccc . --- bb a . ccc ccc bb
|
||||
// aa b - c . --- b aa . c c b
|
||||
public void testChain() throws IOException {
|
||||
final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
|
||||
CharStream cs = new PatternReplaceCharFilter( "a", "aa", ".",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
|
||||
cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
|
||||
getTokens( ts ) );
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue