mirror of https://github.com/apache/lucene.git
SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens, update to new attribute API
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@803977 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
551e52f899
commit
6b0f99d6ad
|
@ -463,6 +463,10 @@ Bug Fixes
|
|||
55. SOLR-1342: CapitalizationFilterFactory uses incorrect term length calculations.
|
||||
(Robert Muir via Mark Miller)
|
||||
|
||||
56. SOLR-1359: DoubleMetaphoneFilter didn't index original tokens if there was no
|
||||
alternative, and could incorrectly skip or reorder tokens. (yonik)
|
||||
|
||||
57. SOLR-1360: Prevent PhoneticFilter from producing duplicate tokens. (yonik)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
|
|
@ -21,6 +21,8 @@ import org.apache.commons.codec.Encoder;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -36,39 +38,63 @@ public class PhoneticFilter extends TokenFilter
|
|||
protected Encoder encoder = null;
|
||||
protected String name = null;
|
||||
|
||||
protected Token save = null;
|
||||
protected State save = null;
|
||||
private final TermAttribute termAtt;
|
||||
private final PositionIncrementAttribute posAtt;
|
||||
|
||||
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
|
||||
super(in);
|
||||
this.encoder = encoder;
|
||||
this.name = name;
|
||||
this.inject = inject;
|
||||
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Token next(Token in) throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
if( save != null ) {
|
||||
Token temp = save;
|
||||
clearAttributes(); restoreState(save);
|
||||
save = null;
|
||||
return temp;
|
||||
return true;
|
||||
}
|
||||
|
||||
Token t = input.next(in);
|
||||
if( t != null ) {
|
||||
String value = new String(t.termBuffer(), 0, t.termLength());
|
||||
if (!input.incrementToken()) return false;
|
||||
|
||||
// pass through zero-length terms
|
||||
if (termAtt.termLength()==0) return true;
|
||||
|
||||
String value = termAtt.term();
|
||||
String phonetic = null;
|
||||
try {
|
||||
value = encoder.encode(value).toString();
|
||||
String v = encoder.encode(value).toString();
|
||||
if (v.length() > 0 && !value.equals(v)) phonetic = v;
|
||||
} catch (Exception ignored) {} // just use the direct text
|
||||
|
||||
if (phonetic == null) return true;
|
||||
|
||||
if (!inject) {
|
||||
// just modify this token
|
||||
termAtt.setTermBuffer(phonetic);
|
||||
return true;
|
||||
}
|
||||
catch (Exception ignored) {} // just use the direct text
|
||||
//Token m = new Token(value, t.startOffset(), t.endOffset(), name );
|
||||
if( inject ) {
|
||||
save = (Token) t.clone();
|
||||
save.setPositionIncrement(0);
|
||||
save.setTermBuffer(value.toCharArray(), 0, value.length());
|
||||
} else {
|
||||
t.setTermBuffer(value.toCharArray(), 0, value.length());
|
||||
}
|
||||
}
|
||||
return t;
|
||||
|
||||
// We need to return both the original and the phonetic tokens.
|
||||
// to avoid a orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig)
|
||||
// we return the phonetic alternative first
|
||||
|
||||
int origOffset = posAtt.getPositionIncrement();
|
||||
posAtt.setPositionIncrement(0);
|
||||
save = captureState();
|
||||
|
||||
posAtt.setPositionIncrement(origOffset);
|
||||
termAtt.setTermBuffer(phonetic);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
input.reset();
|
||||
save = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,17 +71,26 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
|
|||
ArrayList<Token> output = new ArrayList<Token>();
|
||||
for( String s : input ) {
|
||||
stream.add( new Token( s, 0, s.length() ) );
|
||||
|
||||
// phonetic token is added first in the current impl
|
||||
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
|
||||
|
||||
// add the original if applicable
|
||||
if( inject ) {
|
||||
output.add( new Token( s, 0, s.length() ) );
|
||||
}
|
||||
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
|
||||
}
|
||||
|
||||
// System.out.println("###stream="+stream);
|
||||
// System.out.println("###output="+output);
|
||||
|
||||
PhoneticFilter filter = new PhoneticFilter(
|
||||
new IterTokenStream(stream.iterator()), enc, "text", inject );
|
||||
|
||||
for( Token t : output ) {
|
||||
Token got = filter.next(t);
|
||||
// System.out.println("##### got="+got);
|
||||
|
||||
assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
|
||||
}
|
||||
assertNull( filter.next() ); // no more tokens
|
||||
|
|
Loading…
Reference in New Issue