SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens, update to new attribute API

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@803977 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-08-13 18:18:22 +00:00
parent 551e52f899
commit 6b0f99d6ad
3 changed files with 62 additions and 23 deletions

View File

@ -463,6 +463,10 @@ Bug Fixes
55. SOLR-1342: CapitalizationFilterFactory uses incorrect term length calculations.
(Robert Muir via Mark Miller)
56. SOLR-1359: DoubleMetaphoneFilter didn't index original tokens if there was no
alternative, and could incorrectly skip or reorder tokens. (yonik)
57. SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens. (yonik)
Other Changes
----------------------

View File

@ -21,6 +21,8 @@ import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
@ -36,39 +38,63 @@ public class PhoneticFilter extends TokenFilter
protected Encoder encoder = null;
protected String name = null;
protected Token save = null;
protected State save = null;
private final TermAttribute termAtt;
private final PositionIncrementAttribute posAtt;
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
super(in);
this.encoder = encoder;
this.name = name;
this.inject = inject;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
@Override
public final Token next(Token in) throws IOException {
public boolean incrementToken() throws IOException {
if( save != null ) {
Token temp = save;
clearAttributes(); restoreState(save);
save = null;
return temp;
return true;
}
Token t = input.next(in);
if( t != null ) {
String value = new String(t.termBuffer(), 0, t.termLength());
try {
value = encoder.encode(value).toString();
}
catch (Exception ignored) {} // just use the direct text
//Token m = new Token(value, t.startOffset(), t.endOffset(), name );
if( inject ) {
save = (Token) t.clone();
save.setPositionIncrement(0);
save.setTermBuffer(value.toCharArray(), 0, value.length());
} else {
t.setTermBuffer(value.toCharArray(), 0, value.length());
}
if (!input.incrementToken()) return false;
// pass through zero-length terms
if (termAtt.termLength()==0) return true;
String value = termAtt.term();
String phonetic = null;
try {
String v = encoder.encode(value).toString();
if (v.length() > 0 && !value.equals(v)) phonetic = v;
} catch (Exception ignored) {} // just use the direct text
if (phonetic == null) return true;
if (!inject) {
// just modify this token
termAtt.setTermBuffer(phonetic);
return true;
}
return t;
// We need to return both the original and the phonetic tokens.
// to avoid a orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig)
// we return the phonetic alternative first
int origOffset = posAtt.getPositionIncrement();
posAtt.setPositionIncrement(0);
save = captureState();
posAtt.setPositionIncrement(origOffset);
termAtt.setTermBuffer(phonetic);
return true;
}
@Override
public void reset() throws IOException {
input.reset();
save = null;
}
}

View File

@ -71,17 +71,26 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
ArrayList<Token> output = new ArrayList<Token>();
for( String s : input ) {
stream.add( new Token( s, 0, s.length() ) );
// phonetic token is added first in the current impl
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
// add the original if applicable
if( inject ) {
output.add( new Token( s, 0, s.length() ) );
}
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
}
// System.out.println("###stream="+stream);
// System.out.println("###output="+output);
PhoneticFilter filter = new PhoneticFilter(
new IterTokenStream(stream.iterator()), enc, "text", inject );
for( Token t : output ) {
Token got = filter.next(t);
// System.out.println("##### got="+got);
assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
}
assertNull( filter.next() ); // no more tokens