mirror of https://github.com/apache/lucene.git
LUCENE-4993: Fix BeiderMorseFilter to preserve custom attributes when inserting tokens with position increment 0.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1480911 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d9140ea03d
commit
15317f5200
|
@ -124,6 +124,9 @@ Bug Fixes
|
|||
* LUCENE-4994: Fix PatternKeywordMarkerFilter to have public constructor.
|
||||
(Uwe Schindler)
|
||||
|
||||
* LUCENE-4993: Fix BeiderMorseFilter to preserve custom attributes when
|
||||
inserting tokens with position increment 0. (Uwe Schindler)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
|
||||
|
|
|
@ -27,7 +27,6 @@ import org.apache.commons.codec.language.bm.PhoneticEngine;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
|
@ -48,13 +47,11 @@ public final class BeiderMorseFilter extends TokenFilter {
|
|||
private final Matcher matcher = pattern.matcher("");
|
||||
// encoded representation
|
||||
private String encoded;
|
||||
// offsets for any buffered outputs
|
||||
private int startOffset;
|
||||
private int endOffset;
|
||||
// preserves all attributes for any buffered outputs
|
||||
private State state;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
|
||||
/**
|
||||
|
@ -83,10 +80,10 @@ public final class BeiderMorseFilter extends TokenFilter {
|
|||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (matcher.find()) {
|
||||
clearAttributes();
|
||||
assert state != null && encoded != null;
|
||||
restoreState(state);
|
||||
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
offsetAtt.setOffset(startOffset, endOffset);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -94,8 +91,7 @@ public final class BeiderMorseFilter extends TokenFilter {
|
|||
encoded = (languages == null)
|
||||
? engine.encode(termAtt.toString())
|
||||
: engine.encode(termAtt.toString(), languages);
|
||||
startOffset = offsetAtt.startOffset();
|
||||
endOffset = offsetAtt.endOffset();
|
||||
state = captureState();
|
||||
matcher.reset(encoded);
|
||||
if (matcher.find()) {
|
||||
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
|
||||
|
|
|
@ -19,7 +19,9 @@ package org.apache.lucene.analysis.phonetic;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.codec.language.bm.NameType;
|
||||
import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||
|
@ -29,7 +31,10 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.junit.Ignore;
|
||||
|
||||
/** Tests {@link BeiderMorseFilter} */
|
||||
|
@ -103,4 +108,20 @@ public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
|
|||
};
|
||||
checkOneTermReuse(a, "", "");
|
||||
}
|
||||
|
||||
/**
 * Regression test for LUCENE-4993: checks that every token coming out of
 * {@link BeiderMorseFilter} still reports {@code isKeyword() == true} on its
 * {@link KeywordAttribute}, i.e. that an attribute set by an upstream filter is
 * not lost on the tokens this filter emits.
 *
 * @throws IOException if the underlying token stream fails
 */
public void testCustomAttribute() throws IOException {
|
||||
TokenStream stream = new KeywordTokenizer(new StringReader("D'Angelo"));
|
||||
// NOTE(review): the ".*" pattern presumably makes the marker filter flag the
// whole input token as a keyword; the assertions below rely on that.
stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
|
||||
stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
|
||||
// Attribute view taken from the outermost stream; inspected after each incrementToken().
KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
|
||||
stream.reset();
|
||||
// Counts the tokens emitted so the total can be checked after the loop.
int i = 0;
|
||||
while(stream.incrementToken()) {
|
||||
// Every emitted token must still be marked as a keyword.
assertTrue(keyAtt.isKeyword());
|
||||
i++;
|
||||
}
|
||||
// The test expects exactly 12 tokens for "D'Angelo" with this engine configuration.
assertEquals(12, i);
|
||||
stream.end();
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue