SOLR-1423: Use Tokenizer.correctOffset() instead of CharStream.correctOffset()

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@816502 13f79535-47bb-0310-9956-ffa450edef68
Koji Sekiguchi 2009-09-18 07:37:22 +00:00
parent 7b914a84f0
commit aa02a6a5c8
5 changed files with 178 additions and 92 deletions

CHANGES.txt

@@ -691,6 +691,9 @@ Other Changes
     RussianLowerCaseFilterFactory or RussianLetterTokenizerFactory.
     (Robert Muir via hossman)
 
+48. SOLR-1423: Due to LUCENE-1906, Solr's tokenizer should use
+    Tokenizer.correctOffset() instead of CharStream.correctOffset().
+    (Uwe Schindler via koji)
 
 Build
 ----------------------
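
For reference, the Tokenizer.correctOffset() introduced by LUCENE-1906 only delegates to the underlying CharStream when one is actually present, so individual tokenizers no longer need the instanceof check themselves. A rough sketch of the Lucene 2.9 method (paraphrased, not copied verbatim from Lucene):

// Paraphrased sketch of Lucene 2.9's Tokenizer.correctOffset() (LUCENE-1906).
// "input" is the Tokenizer's Reader field; offsets pass through unchanged
// unless a CharFilter chain (a CharStream) sits in front of the tokenizer.
protected final int correctOffset(int currentOff) {
  return (input instanceof CharStream)
      ? ((CharStream) input).correctOffset(currentOff)
      : currentOff;
}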

src/java/org/apache/solr/analysis/PatternTokenizer.java (new file)

@@ -0,0 +1,139 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import org.apache.commons.io.IOUtils;

/**
 * This tokenizer uses regex pattern matching to construct distinct tokens
 * for the input stream.  It takes two arguments: "pattern" and "group".
 * <p/>
 * <ul>
 * <li>"pattern" is the regular expression.</li>
 * <li>"group" says which group to extract into tokens.</li>
 * </ul>
 * <p>
 * group=-1 (the default) is equivalent to "split".  In this case, the tokens will
 * be equivalent to the output from (without empty tokens):
 * {@link String#split(java.lang.String)}
 * </p>
 * <p>
 * Using group >= 0 selects the matching group as the token.  For example, if you have:<br/>
 * <pre>
 *  pattern = \'([^\']+)\'
 *  group = 0
 *  input = aaa 'bbb' 'ccc'
 * </pre>
 * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks).  With the same input
 * but using group=1, the output would be: bbb and ccc (no ' marks)
 * </p>
 * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
 *
 * @version $Id$
 * @see Pattern
 */
public final class PatternTokenizer extends Tokenizer {

  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);

  private String str;
  private int index;

  private final Pattern pattern;
  private final int group;
  private final Matcher matcher;

  /** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */
  public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
    super(input);
    this.pattern = pattern;
    this.group = group;
    str = IOUtils.toString(input);
    matcher = pattern.matcher(str);
    index = 0;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (index >= str.length()) return false;

    if (group >= 0) {

      // match a specific group
      while (matcher.find()) {
        final String match = matcher.group(group);
        if (match.length() == 0) continue;
        termAtt.setTermBuffer(match);
        index = matcher.start(group);
        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
        return true;
      }

      index = Integer.MAX_VALUE; // mark exhausted
      return false;

    } else {

      // String.split() functionality
      while (matcher.find()) {
        if (matcher.start() - index > 0) {
          // found a non-zero-length token
          termAtt.setTermBuffer(str, index, matcher.start() - index);
          offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
          index = matcher.end();
          return true;
        }
        index = matcher.end();
      }

      if (str.length() - index == 0) {
        index = Integer.MAX_VALUE; // mark exhausted
        return false;
      }

      termAtt.setTermBuffer(str, index, str.length() - index);
      offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
      index = Integer.MAX_VALUE; // mark exhausted
      return true;
    }
  }

  @Override
  public void end() throws IOException {
    final int ofs = correctOffset(str.length());
    offsetAtt.setOffset(ofs, ofs);
  }

  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    str = IOUtils.toString(input);
    matcher.reset(str);
    index = 0;
  }
}
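
A minimal usage sketch for the new tokenizer (hypothetical snippet, not part of the commit; imports as in the file above, plus java.io.StringReader), using the group=1 example from the class javadoc:

Pattern quoted = Pattern.compile("'([^']+)'");
Tokenizer tok = new PatternTokenizer(new StringReader("aaa 'bbb' 'ccc'"), quoted, 1);
TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
OffsetAttribute offsets = (OffsetAttribute) tok.addAttribute(OffsetAttribute.class);
while (tok.incrementToken()) {
  // prints "bbb [5,8]" then "ccc [11,14]"; offsets point into the original input
  System.out.println(term.term() + " [" + offsets.startOffset() + "," + offsets.endOffset() + "]");
}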

src/java/org/apache/solr/analysis/PatternTokenizerFactory.java

@@ -20,14 +20,11 @@ package org.apache.solr.analysis;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
@@ -43,9 +40,8 @@ import org.apache.solr.common.SolrException;
  * </ul>
  * <p>
  * group=-1 (the default) is equivalent to "split".  In this case, the tokens will
- * be equivalent to the output from:
- *
- * http://java.sun.com/j2se/1.4.2/docs/api/java/lang/String.html#split(java.lang.String)
+ * be equivalent to the output from (without empty tokens):
+ * {@link String#split(java.lang.String)}
  * </p>
  * <p>
  * Using group >= 0 selects the matching group as the token.  For example, if you have:<br/>
@@ -57,7 +53,9 @@ import org.apache.solr.common.SolrException;
  * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks).  With the same input
  * but using group=1, the output would be: bbb and ccc (no ' marks)
  * </p>
+ * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
  *
+ * @see PatternTokenizer
  * @since solr1.2
  * @version $Id:$
  */
@@ -101,70 +99,20 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
    */
   public Tokenizer create(final Reader in) {
     try {
-      return new Tokenizer(in) {
-        {init();}
-        List<Token> tokens;
-        Iterator<Token> iter;
-        void init() throws IOException {
-          // Read the input into a single string
-          String str = IOUtils.toString( input );
-          Matcher matcher = pattern.matcher( str );
-          tokens = (group < 0 )
-            ? split( matcher, str, input )
-            : group( matcher, str, group, input );
-          iter = tokens.iterator();
-        }
-//        @Override
-//        public boolean incrementToken() throws IOException {
-//          return super.incrementToken();
-//        }
-        @Override
-        public void end() throws IOException {
-          super.end();
-        }
-//        @Override
-//        public Token next(Token reusableToken) throws IOException {
-//          return super.next(reusableToken);
-//        }
-        @Override
-        public void reset(Reader input) throws IOException {
-          super.reset(input);
-          init();
-        }
-        @Override
-        public Token next() throws IOException {
-          if( iter.hasNext() ) {
-            return iter.next();
-          }
-          return null;
-        }
-      };
-    }
-    catch( IOException ex ) {
+      return new PatternTokenizer(in, pattern, group);
+    } catch( IOException ex ) {
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
     }
   }
-
-  /**
-   * @deprecated
-   */
-  public static List<Token> split( Matcher matcher, String input ){
-    return split(matcher,input,null);
-  }
-
   /**
    * This behaves just like String.split( ), but returns a list of Tokens
    * rather then an array of strings
+   * NOTE: This method is not used in 1.4.
    * @deprecated
    */
-  public static List<Token> split( Matcher matcher, String input, Reader stream )
+  @Deprecated
+  public static List<Token> split( Matcher matcher, String input )
   {
     int index = 0;
     int lastNonEmptySize = Integer.MAX_VALUE;
@@ -173,7 +121,7 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // Add segments before each match found
     while(matcher.find()) {
       String match = input.subSequence(index, matcher.start()).toString();
-      matchList.add( newToken( stream, match, index, matcher.start()) );
+      matchList.add( new Token( match, index, matcher.start()) );
       index = matcher.end();
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
@@ -182,11 +130,11 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     // If no match is found, return the full string
     if (index == 0) {
-      matchList.add( newToken( stream, input, 0, input.length()) );
+      matchList.add( new Token( input, 0, input.length()) );
     }
     else {
       String match = input.subSequence(index, input.length()).toString();
-      matchList.add( newToken( stream, match, index, input.length()) );
+      matchList.add( new Token( match, index, input.length()) );
       if( match.length() > 0 ) {
         lastNonEmptySize = matchList.size();
       }
@@ -199,22 +147,17 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     return matchList;
   }
-
-  /**
-   * @deprecated
-   */
-  public static List<Token> group( Matcher matcher, String input, int group ){
-    return group(matcher, input, group, null);
-  }
   /**
    * Create tokens from the matches in a matcher
+   * NOTE: This method is not used in 1.4.
    * @deprecated
    */
-  public static List<Token> group( Matcher matcher, String input, int group, Reader stream )
+  @Deprecated
+  public static List<Token> group( Matcher matcher, String input, int group )
   {
     ArrayList<Token> matchList = new ArrayList<Token>();
     while(matcher.find()) {
-      Token t = newToken( stream,
+      Token t = new Token(
         matcher.group(group),
         matcher.start(group),
         matcher.end(group) );
@@ -222,15 +165,4 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     }
     return matchList;
   }
-
-  private static Token newToken(Reader reader, String text, int start, int end ){
-    Token token;
-    if( reader instanceof CharStream) {
-      CharStream stream = (CharStream)reader;
-      token = new Token( text, stream.correctOffset( start ), stream.correctOffset( end ) );
-    } else {
-      token = new Token( text, start, end );
-    }
-    return token;
-  }
 }
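
For completeness, wiring the factory up programmatically looks like this (hypothetical snippet modeled on the test file below, imports elided; in a deployment the same "pattern"/"group" arguments come from the tokenizer declaration in schema.xml):

Map<String,String> args = new HashMap<String,String>();
args.put( PatternTokenizerFactory.PATTERN, "'([^']+)'" );
args.put( PatternTokenizerFactory.GROUP, "1" );
PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
tokFactory.init( args );
// yields the tokens: bbb, ccc
Tokenizer stream = tokFactory.create( new StringReader("aaa 'bbb' 'ccc'") );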

src/java/org/apache/solr/schema/FieldType.java

@@ -311,7 +311,7 @@ public abstract class FieldType extends FieldProperties {
         if (n<=0) return false;
         String s = toInternal(new String(cbuf,0,n));
         termAtt.setTermBuffer(s);
-        offsetAtt.setOffset(0,n);
+        offsetAtt.setOffset(correctOffset(0),correctOffset(n));
         return true;
       }
     };
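
This one-line FieldType fix matters whenever a CharFilter precedes the tokenizer: the filter can change text lengths, and correctOffset() maps offsets in the filtered text back to positions in the original input. A hedged illustration using Lucene 2.9's MappingCharFilter (API assumed from LUCENE-1466, not part of this commit):

NormalizeCharMap map = new NormalizeCharMap();
map.add("ü", "ue"); // one char becomes two, shifting all later offsets
CharStream cs = new MappingCharFilter(map, CharReader.get(new StringReader("Günther")));
// A tokenizer reading cs sees "Guenther" (8 chars) but must report offsets into
// "Günther" (7 chars); Tokenizer.correctOffset() performs exactly that mapping.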

src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java

@@ -39,9 +39,9 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
       // group  pattern  input  output
       { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
       { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
-      { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
+      { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
       { "-1", ":", "boo:and:foo", "boo and foo" },
-      { "-1", "o", "boo:and:foo", "b :and:f" },
+      { "-1", "o", "boo:and:foo", "b :and:f" },
       { "0", ":", "boo:and:foo", ": :" },
       { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
       { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
@@ -60,10 +60,11 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
       String out = TestHyphenatedWordsFilter.tsToString( stream );
       System.out.println( test[2] + " ==> " + out );
-      assertEquals("pattern: "+test[2], test[3], out );
+      assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
 
       // Make sure it is the same as if we called 'split'
-      if( "-1".equals( test[0] ) ) {
+      // test disabled, as we remove empty tokens
+      /*if( "-1".equals( test[0] ) ) {
         String[] split = test[2].split( test[1] );
         stream = tokenizer.create( new StringReader( test[2] ) );
         int i=0;
@@ -71,7 +72,7 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
         {
           assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
         }
-      }
+      }*/
     }
   }
@@ -96,5 +97,16 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
     List<Token> result = getTokens( stream );
     List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
     assertTokEqualOff( expect, result );
+
+    charStream.reset();
+    args.put( PatternTokenizerFactory.PATTERN, "Günther" );
+    args.put( PatternTokenizerFactory.GROUP, "0" );
+    tokFactory = new PatternTokenizerFactory();
+    tokFactory.init( args );
+    stream = tokFactory.create( charStream );
+    result = getTokens( stream );
+    expect = tokens( "Günther,1,0,12 Günther,1,13,25" );
+    assertTokEqualOff( expect, result );
   }
 }