SOLR-1398: Add offset corrections in PatternTokenizerFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@811753 13f79535-47bb-0310-9956-ffa450edef68
Koji Sekiguchi 2009-09-06 03:45:42 +00:00
parent c829e36d23
commit 5324b752cb
3 changed files with 65 additions and 11 deletions
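
Background for the patch below: PatternTokenizer matches its regex against text that has already passed through any configured CharFilters, so the start/end offsets it computes index the filtered string rather than the raw input. When a filter such as MappingCharFilter changes the text length, those offsets have to be mapped back through CharStream.correctOffset(), otherwise highlighting points at the wrong characters of the original field value. A minimal sketch of the mismatch, using the same &uuml; -> ü mapping as the new test (the class name OffsetCorrectionDemo is illustrative, not part of the patch):

import java.io.StringReader;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;

public class OffsetCorrectionDemo {
  public static void main(String[] args) throws Exception {
    // "&uuml;" (6 chars) is collapsed to "ü" (1 char) by the CharFilter,
    // so every offset after the replacement shifts by 5.
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add( "&uuml;", "ü" );
    CharStream stream = new MappingCharFilter( normMap,
        CharReader.get( new StringReader( "G&uuml;nther" ) ) );

    // The tokenizer only ever sees the filtered text "Günther" (7 chars).
    char[] buf = new char[64];
    int len = stream.read( buf, 0, buf.length );
    System.out.println( new String( buf, 0, len ) );  // Günther

    // A raw end offset of 7 would cut off mid-entity in "G&uuml;nther";
    // correctOffset() maps it back to 12, the true end in the original input.
    System.out.println( stream.correctOffset( 7 ) );  // 12
  }
}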

CHANGES.txt

@@ -523,6 +523,9 @@ Bug Fixes
     include the "1" (ie: 2**0) bucket in the term histogram data.
     (hossman)
+63. SOLR-1398: Add offset corrections in PatternTokenizerFactory.
+    (Anders Melchiorsen, koji)
+
 Other Changes
 ----------------------
 1. Upgraded to Lucene 2.4.0 (yonik)

src/java/org/apache/solr/analysis/PatternTokenizerFactory.java

@@ -27,6 +27,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
@@ -112,8 +113,8 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
       Matcher matcher = pattern.matcher( str );
       tokens = (group < 0 )
-          ? split( matcher, str )
-          : group( matcher, str, group );
+          ? split( matcher, str, input )
+          : group( matcher, str, group, input );
       iter = tokens.iterator();
     }
@@ -151,12 +152,19 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
     }
   }
+
+  /**
+   * @deprecated
+   */
+  public static List<Token> split( Matcher matcher, String input ){
+    return split(matcher,input,null);
+  }

   /**
    * This behaves just like String.split( ), but returns a list of Tokens
    * rather then an array of strings
    */
-  public static List<Token> split( Matcher matcher, String input )
+  public static List<Token> split( Matcher matcher, String input, CharStream stream )
   {
     int index = 0;
     int lastNonEmptySize = Integer.MAX_VALUE;
@@ -165,7 +173,7 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // Add segments before each match found
     while(matcher.find()) {
       String match = input.subSequence(index, matcher.start()).toString();
-      matchList.add( new Token( match, index, matcher.start()) );
+      matchList.add( newToken( stream, match, index, matcher.start()) );
       index = matcher.end();
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
@@ -174,11 +182,11 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // If no match is found, return the full string
     if (index == 0) {
-      matchList.add( new Token( input, 0, input.length()) );
+      matchList.add( newToken( stream, input, 0, input.length()) );
     }
     else {
       String match = input.subSequence(index, input.length()).toString();
-      matchList.add( new Token( match, index, input.length()) );
+      matchList.add( newToken( stream, match, index, input.length()) );
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
       }
@@ -192,14 +200,21 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
   }
+
+  /**
+   * @deprecated
+   */
+  public static List<Token> group( Matcher matcher, String input, int group ){
+    return group(matcher, input, group, null);
+  }

   /**
    * Create tokens from the matches in a matcher
    */
-  public static List<Token> group( Matcher matcher, String input, int group )
+  public static List<Token> group( Matcher matcher, String input, int group, CharStream stream )
   {
     ArrayList<Token> matchList = new ArrayList<Token>();
     while(matcher.find()) {
-      Token t = new Token(
+      Token t = newToken( stream,
         matcher.group(group),
         matcher.start(group),
         matcher.end(group) );
@@ -207,4 +222,13 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     }
     return matchList;
   }
+
+  private static Token newToken( CharStream stream, String text, int start, int end ){
+    Token token = null;
+    if( stream != null )
+      token = new Token( text, stream.correctOffset( start ), stream.correctOffset( end ) );
+    else
+      token = new Token( text, start, end );
+    return token;
+  }
 }
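
The two- and three-argument overloads stay behind as deprecated wrappers that pass a null stream, and newToken() falls back to the uncorrected offsets in that case, so existing callers keep compiling and behaving as before. A sketch of driving the new split() overload directly, mirroring what the factory's create() appears to do (drain the CharStream into a String, then match; the class name SplitOffsetsDemo is illustrative):

import java.io.StringReader;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.Token;
import org.apache.solr.analysis.PatternTokenizerFactory;

public class SplitOffsetsDemo {
  public static void main(String[] args) throws Exception {
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add( "&uuml;", "ü" );
    CharStream stream = new MappingCharFilter( normMap,
        CharReader.get( new StringReader( "G&uuml;nther G&uuml;nther is here" ) ) );

    // Drain the filtered stream, then run the pattern over the result.
    // MappingCharFilter records its offset corrections as it is read,
    // so correctOffset() is usable afterwards.
    String str = IOUtils.toString( stream );
    Matcher matcher = Pattern.compile( "[,;/\\s]+" ).matcher( str );

    // Passing the stream makes split() route every start/end offset
    // through correctOffset(); passing null would keep the raw offsets.
    List<Token> tokens = PatternTokenizerFactory.split( matcher, str, stream );
    for( Token t : tokens )
      System.out.println( t.term() + " " + t.startOffset() + "-" + t.endOffset() );
    // Günther 0-12, Günther 13-25, is 26-28, here 29-33
  }
}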

src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java

@@ -18,15 +18,19 @@
 package org.apache.solr.analysis;

 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 import junit.framework.TestCase;

+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MappingCharFilter;
+import org.apache.lucene.analysis.NormalizeCharMap;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;

-public class TestPatternTokenizerFactory extends AnalysisTestCase
+public class TestPatternTokenizerFactory extends BaseTokenTestCase
 {
   public void testSplitting() throws Exception
   {
@@ -70,4 +74,27 @@ public class TestPatternTokenizerFactory extends AnalysisTestCase
       }
     }
   }
+
+  public void testOffsetCorrection() throws Exception {
+    final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+    // create MappingCharFilter
+    MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
+    List<String> mappingRules = new ArrayList<String>();
+    mappingRules.add( "\"&uuml;\" => \"ü\"" );
+    NormalizeCharMap normMap = new NormalizeCharMap();
+    cfFactory.parseRules( mappingRules, normMap );
+    CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+
+    // create PatternTokenizer
+    Map<String,String> args = new HashMap<String, String>();
+    args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
+    PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
+    tokFactory.init( args );
+    TokenStream stream = tokFactory.create( charStream );
+
+    List<Token> result = getTokens( stream );
+    List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
+    assertTokEqualOff( expect, result );
+  }
 }
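
Assuming the inherited tokens() helper parses each expected entry as text,positionIncrement,startOffset,endOffset, the offsets in the expectation string can be checked by hand: they must index the raw, pre-CharFilter input, not the filtered text. A quick verification (the class name OffsetCheck is illustrative):

public class OffsetCheck {
  public static void main(String[] args) {
    String INPUT = "G&uuml;nther G&uuml;nther is here";
    // Each expected start/end pair selects a whole term in the raw input:
    System.out.println( INPUT.substring( 0, 12 ) );   // G&uuml;nther
    System.out.println( INPUT.substring( 13, 25 ) );  // G&uuml;nther
    System.out.println( INPUT.substring( 26, 28 ) );  // is
    System.out.println( INPUT.substring( 29, 33 ) );  // here
  }
}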