SOLR-1398: Add offset corrections in PatternTokenizerFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@811753 13f79535-47bb-0310-9956-ffa450edef68
Koji Sekiguchi 2009-09-06 03:45:42 +00:00
parent c829e36d23
commit 5324b752cb
3 changed files with 65 additions and 11 deletions
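
Background for the patch below: PatternTokenizer matches its regex against text that has already passed through any configured CharFilters, so the start/end offsets it computes index the filtered string rather than the raw input. When a filter such as MappingCharFilter changes the text length, those offsets have to be mapped back through CharStream.correctOffset(), otherwise highlighting points at the wrong characters of the original field value. A minimal sketch of the mismatch, using the same &uuml; -> ü mapping as the new test (the class name OffsetCorrectionDemo is illustrative, not part of the patch):

import java.io.StringReader;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;

public class OffsetCorrectionDemo {
  public static void main(String[] args) throws Exception {
    // "&uuml;" (6 chars) is collapsed to "ü" (1 char) by the CharFilter,
    // so every offset after the replacement shifts by 5.
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add( "&uuml;", "ü" );
    CharStream stream = new MappingCharFilter( normMap,
        CharReader.get( new StringReader( "G&uuml;nther" ) ) );

    // The tokenizer only ever sees the filtered text "Günther" (7 chars).
    char[] buf = new char[64];
    int len = stream.read( buf, 0, buf.length );
    System.out.println( new String( buf, 0, len ) );  // Günther

    // A raw end offset of 7 would cut off mid-entity in "G&uuml;nther";
    // correctOffset() maps it back to 12, the true end in the original input.
    System.out.println( stream.correctOffset( 7 ) );  // 12
  }
}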

CHANGES.txt

@@ -523,6 +523,9 @@ Bug Fixes
     include the "1" (ie: 2**0) bucket in the term histogram data.
     (hossman)
+63. SOLR-1398: Add offset corrections in PatternTokenizerFactory.
+    (Anders Melchiorsen, koji)
+
 Other Changes
 ----------------------
 1. Upgraded to Lucene 2.4.0 (yonik)

src/java/org/apache/solr/analysis/PatternTokenizerFactory.java

@@ -27,6 +27,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
@@ -112,8 +113,8 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
       Matcher matcher = pattern.matcher( str );
       tokens = (group < 0 )
-          ? split( matcher, str )
-          : group( matcher, str, group );
+          ? split( matcher, str, input )
+          : group( matcher, str, group, input );
       iter = tokens.iterator();
     }
@@ -151,12 +152,19 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
     }
   }
+
+  /**
+   * @deprecated
+   */
+  public static List<Token> split( Matcher matcher, String input ){
+    return split(matcher,input,null);
+  }

   /**
    * This behaves just like String.split( ), but returns a list of Tokens
    * rather then an array of strings
    */
-  public static List<Token> split( Matcher matcher, String input )
+  public static List<Token> split( Matcher matcher, String input, CharStream stream )
   {
     int index = 0;
     int lastNonEmptySize = Integer.MAX_VALUE;
@@ -165,7 +173,7 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // Add segments before each match found
     while(matcher.find()) {
       String match = input.subSequence(index, matcher.start()).toString();
-      matchList.add( new Token( match, index, matcher.start()) );
+      matchList.add( newToken( stream, match, index, matcher.start()) );
       index = matcher.end();
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
@@ -174,11 +182,11 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // If no match is found, return the full string
     if (index == 0) {
-      matchList.add( new Token( input, 0, input.length()) );
+      matchList.add( newToken( stream, input, 0, input.length()) );
     }
     else {
       String match = input.subSequence(index, input.length()).toString();
-      matchList.add( new Token( match, index, input.length()) );
+      matchList.add( newToken( stream, match, index, input.length()) );
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
       }
@@ -192,14 +200,21 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
   }
+
+  /**
+   * @deprecated
+   */
+  public static List<Token> group( Matcher matcher, String input, int group ){
+    return group(matcher, input, group, null);
+  }

   /**
    * Create tokens from the matches in a matcher
    */
-  public static List<Token> group( Matcher matcher, String input, int group )
+  public static List<Token> group( Matcher matcher, String input, int group, CharStream stream )
   {
     ArrayList<Token> matchList = new ArrayList<Token>();
     while(matcher.find()) {
-      Token t = new Token(
+      Token t = newToken( stream,
         matcher.group(group),
         matcher.start(group),
         matcher.end(group) );
@@ -207,4 +222,13 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     }
     return matchList;
   }
+
+  private static Token newToken( CharStream stream, String text, int start, int end ){
+    Token token = null;
+    if( stream != null )
+      token = new Token( text, stream.correctOffset( start ), stream.correctOffset( end ) );
+    else
+      token = new Token( text, start, end );
+    return token;
+  }
 }
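
The two- and three-argument overloads stay behind as deprecated wrappers that pass a null stream, and newToken() falls back to the uncorrected offsets in that case, so existing callers keep compiling and behaving as before. A sketch of driving the new split() overload directly, mirroring what the factory's create() appears to do (drain the CharStream into a String, then match; the class name SplitOffsetsDemo is illustrative):

import java.io.StringReader;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.Token;
import org.apache.solr.analysis.PatternTokenizerFactory;

public class SplitOffsetsDemo {
  public static void main(String[] args) throws Exception {
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add( "&uuml;", "ü" );
    CharStream stream = new MappingCharFilter( normMap,
        CharReader.get( new StringReader( "G&uuml;nther G&uuml;nther is here" ) ) );

    // Drain the filtered stream, then run the pattern over the result.
    // MappingCharFilter records its offset corrections as it is read,
    // so correctOffset() is usable afterwards.
    String str = IOUtils.toString( stream );
    Matcher matcher = Pattern.compile( "[,;/\\s]+" ).matcher( str );

    // Passing the stream makes split() route every start/end offset
    // through correctOffset(); passing null would keep the raw offsets.
    List<Token> tokens = PatternTokenizerFactory.split( matcher, str, stream );
    for( Token t : tokens )
      System.out.println( t.term() + " " + t.startOffset() + "-" + t.endOffset() );
    // Günther 0-12, Günther 13-25, is 26-28, here 29-33
  }
}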

src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java

@@ -18,15 +18,19 @@
 package org.apache.solr.analysis;

 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 import junit.framework.TestCase;

+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MappingCharFilter;
+import org.apache.lucene.analysis.NormalizeCharMap;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;

-public class TestPatternTokenizerFactory extends AnalysisTestCase
+public class TestPatternTokenizerFactory extends BaseTokenTestCase
 {
   public void testSplitting() throws Exception
   {
@@ -70,4 +74,27 @@ public class TestPatternTokenizerFactory extends AnalysisTestCase
       }
     }
   }
+
+  public void testOffsetCorrection() throws Exception {
+    final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+    // create MappingCharFilter
+    MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
+    List<String> mappingRules = new ArrayList<String>();
+    mappingRules.add( "\"&uuml;\" => \"ü\"" );
+    NormalizeCharMap normMap = new NormalizeCharMap();
+    cfFactory.parseRules( mappingRules, normMap );
+    CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+
+    // create PatternTokenizer
+    Map<String,String> args = new HashMap<String, String>();
+    args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
+    PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
+    tokFactory.init( args );
+    TokenStream stream = tokFactory.create( charStream );
+
+    List<Token> result = getTokens( stream );
+    List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
+    assertTokEqualOff( expect, result );
+  }
 }
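
Assuming the inherited tokens() helper parses each expected entry as text,positionIncrement,startOffset,endOffset, the offsets in the expectation string can be checked by hand: they must index the raw, pre-CharFilter input, not the filtered text. A quick verification (the class name OffsetCheck is illustrative):

public class OffsetCheck {
  public static void main(String[] args) {
    String INPUT = "G&uuml;nther G&uuml;nther is here";
    // Each expected start/end pair selects a whole term in the raw input:
    System.out.println( INPUT.substring( 0, 12 ) );   // G&uuml;nther
    System.out.println( INPUT.substring( 13, 25 ) );  // G&uuml;nther
    System.out.println( INPUT.substring( 26, 28 ) );  // is
    System.out.println( INPUT.substring( 29, 33 ) );  // here
  }
}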