SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens, update to new attribute API

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@803977 13f79535-47bb-0310-9956-ffa450edef68
2009-08-13 18:18:22 +00:00 · 2009-08-13 18:18:22 +00:00 · 6b0f99d6ad
parent 551e52f899
commit 6b0f99d6ad
3 changed files with 62 additions and 23 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -463,6 +463,10 @@ Bug Fixes
 55. SOLR-1342: CapitalizationFilterFactory uses incorrect term length calculations.
    (Robert Muir via Mark Miller)

+56. SOLR-1359: DoubleMetaphoneFilter didn't index original tokens if there was no
+    alternative, and could incorrectly skip or reorder tokens.  (yonik)
+
+57. SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens. (yonik)

 Other Changes
 ----------------------
--- a/src/java/org/apache/solr/analysis/PhoneticFilter.java
+++ b/src/java/org/apache/solr/analysis/PhoneticFilter.java
@ -21,6 +21,8 @@ import org.apache.commons.codec.Encoder;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

 import java.io.IOException;

@ -36,39 +38,63 @@ public class PhoneticFilter extends TokenFilter
  protected Encoder encoder = null;
  protected String name = null;
  
-  protected Token save = null;
+  protected State save = null;
+  private final TermAttribute termAtt;
+  private final PositionIncrementAttribute posAtt;

  public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
    super(in);
    this.encoder = encoder;
    this.name = name;
    this.inject = inject;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);    
  }

  @Override
-  public final Token next(Token in) throws IOException {
+  public boolean incrementToken() throws IOException {
    if( save != null ) {
-      Token temp = save;
+      clearAttributes(); restoreState(save);
      save = null;
-      return temp;
+      return true;
    }

-    Token t = input.next(in);
-    if( t != null ) {
-      String value = new String(t.termBuffer(), 0, t.termLength());
+    if (!input.incrementToken()) return false;
+
+    // pass through zero-length terms
+    if (termAtt.termLength()==0) return true;
+
+    String value = termAtt.term();
+    String phonetic = null;
    try {
-        value = encoder.encode(value).toString();
+     String v = encoder.encode(value).toString();
+     if (v.length() > 0 && !value.equals(v)) phonetic = v;
+    } catch (Exception ignored) {} // just use the direct text
+
+    if (phonetic == null) return true;
+
+    if (!inject) {
+      // just modify this token
+      termAtt.setTermBuffer(phonetic);
+      return true;
    }
-      catch (Exception ignored) {} // just use the direct text
-      //Token m = new Token(value, t.startOffset(), t.endOffset(), name );
-      if( inject ) {
-        save = (Token) t.clone();
-        save.setPositionIncrement(0);
-        save.setTermBuffer(value.toCharArray(), 0, value.length());
-      } else {
-        t.setTermBuffer(value.toCharArray(), 0, value.length());
-      }
-    }
-    return t;
+
+    // We need to return both the original and the phonetic tokens.
+    // to avoid a orig=captureState() change_to_phonetic() saved=captureState()  restoreState(orig)
+    // we return the phonetic alternative first
+
+    int origOffset = posAtt.getPositionIncrement();
+    posAtt.setPositionIncrement(0);
+    save = captureState();
+
+    posAtt.setPositionIncrement(origOffset);
+    termAtt.setTermBuffer(phonetic);
+    return true;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    save = null;
  }
 }
--- a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
+++ b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
@ -71,17 +71,26 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
    ArrayList<Token> output = new ArrayList<Token>();
    for( String s : input ) {
      stream.add( new Token( s, 0, s.length() ) );
+
+      // phonetic token is added first in the current impl
+      output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
+
+      // add the original if applicable
      if( inject ) {
        output.add( new Token( s, 0, s.length() ) );
      }
-      output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
    }

+    // System.out.println("###stream="+stream);
+    // System.out.println("###output="+output);
+
    PhoneticFilter filter = new PhoneticFilter( 
        new IterTokenStream(stream.iterator()), enc, "text", inject );
    
    for( Token t : output ) {
      Token got = filter.next(t);
+      // System.out.println("##### got="+got);
+
      assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
    }
    assertNull( filter.next() );  // no more tokens