SOLR-1398: Add offset corrections in PatternTokenizerFactory
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@811753 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent c829e36d23
commit 5324b752cb
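The problem this commit addresses: PatternTokenizerFactory tokenizes the text it reads from its CharStream, but a CharFilter earlier in the chain (such as MappingCharFilter) may change the length of that text, so offsets measured against the filtered text no longer point into the original input. CharStream.correctOffset() maps a filtered-text offset back to the corresponding original offset, and this commit threads the CharStream through split() and group() so every token's offsets are corrected. A minimal sketch of the effect, assuming the Lucene 2.x analysis API used throughout this diff (the mapping rule mirrors the one in the new test below; run inside a method that throws IOException):

    // Sketch (not part of the commit): why correction is needed.
    // Uses org.apache.lucene.analysis.{CharReader, CharStream, MappingCharFilter,
    // NormalizeCharMap} and java.io.StringReader, as imported in the test below.
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add( "&uuml;", "ü" );   // 6 chars -> 1 char: the text shrinks
    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "G&uuml;nther" ) ) );
    char[] buf = new char[8];
    cs.read( buf, 0, buf.length );  // consume the filtered text, "Günther" (7 chars)
    // A token covering all of it ends at offset 7 in the filtered text,
    // but at offset 12 in the raw input:
    int correctedEnd = cs.correctOffset( 7 );  // -> 12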
CHANGES.txt
@@ -523,6 +523,9 @@ Bug Fixes
     include the "1" (ie: 2**0) bucket in the term histogram data.
     (hossman)
 
+63. SOLR-1398: Add offset corrections in PatternTokenizerFactory.
+    (Anders Melchiorsen, koji)
+
 Other Changes
 ----------------------
  1. Upgraded to Lucene 2.4.0 (yonik)
PatternTokenizerFactory.java
@@ -27,6 +27,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
@@ -112,8 +113,8 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
 
       Matcher matcher = pattern.matcher( str );
       tokens = (group < 0 )
-        ? split( matcher, str )
-        : group( matcher, str, group );
+        ? split( matcher, str, input )
+        : group( matcher, str, group, input );
       iter = tokens.iterator();
     }
 
@@ -151,12 +152,19 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
     }
   }
 
+  /**
+   * @deprecated
+   */
+  public static List<Token> split( Matcher matcher, String input ){
+    return split(matcher,input,null);
+  }
+
   /**
    * This behaves just like String.split( ), but returns a list of Tokens
    * rather then an array of strings
    */
-  public static List<Token> split( Matcher matcher, String input )
+  public static List<Token> split( Matcher matcher, String input, CharStream stream )
   {
     int index = 0;
     int lastNonEmptySize = Integer.MAX_VALUE;
@@ -165,7 +173,7 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // Add segments before each match found
     while(matcher.find()) {
       String match = input.subSequence(index, matcher.start()).toString();
-      matchList.add( new Token( match, index, matcher.start()) );
+      matchList.add( newToken( stream, match, index, matcher.start()) );
       index = matcher.end();
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
@@ -174,11 +182,11 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
 
     // If no match is found, return the full string
     if (index == 0) {
-      matchList.add( new Token( input, 0, input.length()) );
+      matchList.add( newToken( stream, input, 0, input.length()) );
     }
     else {
       String match = input.subSequence(index, input.length()).toString();
-      matchList.add( new Token( match, index, input.length()) );
+      matchList.add( newToken( stream, match, index, input.length()) );
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
       }
@@ -192,14 +200,21 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
   }
 
 
+  /**
+   * @deprecated
+   */
+  public static List<Token> group( Matcher matcher, String input, int group ){
+    return group(matcher, input, group, null);
+  }
+
   /**
    * Create tokens from the matches in a matcher
    */
-  public static List<Token> group( Matcher matcher, String input, int group )
+  public static List<Token> group( Matcher matcher, String input, int group, CharStream stream )
   {
     ArrayList<Token> matchList = new ArrayList<Token>();
     while(matcher.find()) {
-      Token t = new Token(
+      Token t = newToken( stream,
         matcher.group(group),
         matcher.start(group),
         matcher.end(group) );
@ -207,4 +222,13 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
|
|||
}
|
||||
return matchList;
|
||||
}
|
||||
|
||||
private static Token newToken( CharStream stream, String text, int start, int end ){
|
||||
Token token = null;
|
||||
if( stream != null )
|
||||
token = new Token( text, stream.correctOffset( start ), stream.correctOffset( end ) );
|
||||
else
|
||||
token = new Token( text, start, end );
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
|
|
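Note that the old two-argument split()/group() overloads survive only as @deprecated wrappers that pass a null stream, in which case newToken() falls back to the uncorrected offsets. For reference, a hedged sketch of calling the new overloads directly; the variable name `charStream` and the pattern are illustrative (taken from the test below), and create() does the equivalent internally:

    // str is the filtered text, read from the CharStream the same way create() does:
    String str = IOUtils.toString( charStream );
    Matcher matcher = Pattern.compile( "[,;/\\s]+" ).matcher( str );
    // Each returned Token's offsets have been passed through
    // charStream.correctOffset(), so they index into the original,
    // pre-CharFilter input:
    List<Token> tokens = PatternTokenizerFactory.split( matcher, str, charStream );
    // The deprecated two-argument form is equivalent to passing null here,
    // which skips correction and leaves offsets relative to the filtered text.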
TestPatternTokenizerFactory.java
@@ -18,15 +18,19 @@
 package org.apache.solr.analysis;
 
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MappingCharFilter;
+import org.apache.lucene.analysis.NormalizeCharMap;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
-public class TestPatternTokenizerFactory extends AnalysisTestCase
+public class TestPatternTokenizerFactory extends BaseTokenTestCase
 {
   public void testSplitting() throws Exception
   {
@@ -70,4 +74,27 @@ public class TestPatternTokenizerFactory extends AnalysisTestCase
       }
     }
   }
+
+  public void testOffsetCorrection() throws Exception {
+    final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+    // create MappingCharFilter
+    MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
+    List<String> mappingRules = new ArrayList<String>();
+    mappingRules.add( "\"&uuml;\" => \"ü\"" );
+    NormalizeCharMap normMap = new NormalizeCharMap();
+    cfFactory.parseRules( mappingRules, normMap );
+    CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+
+    // create PatternTokenizer
+    Map<String,String> args = new HashMap<String, String>();
+    args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
+    PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
+    tokFactory.init( args );
+    TokenStream stream = tokFactory.create( charStream );
+
+    List<Token> result = getTokens( stream );
+    List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
+    assertTokEqualOff( expect, result );
+  }
 }
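In the expected-token string, each entry appears to encode text, position increment, start offset, and end offset, as parsed by the BaseTokenTestCase helpers tokens() and assertTokEqualOff(). The offsets show the correction at work: "G&uuml;nther" occupies 12 characters of the raw input, so the first filtered token "Günther" (7 characters after the char filter runs) still reports offsets 0 to 12 into the original input, the second occurrence 13 to 25, then "is" at 26 to 28 and "here" at 29 to 33.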