LUCENE-4993: Fix BeiderMorseFilter to preserve custom attributes when inserting tokens with position increment 0.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1480911 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2013-05-10 08:00:19 +00:00
parent d9140ea03d
commit 15317f5200
3 changed files with 29 additions and 9 deletions

View File

@ -124,6 +124,9 @@ Bug Fixes
* LUCENE-4994: Fix PatternKeywordMarkerFilter to have public constructor.
(Uwe Schindler)
* LUCENE-4993: Fix BeiderMorseFilter to preserve custom attributes when
inserting tokens with position increment 0. (Uwe Schindler)
Optimizations
* LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher

View File

@ -27,7 +27,6 @@ import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
@ -48,13 +47,11 @@ public final class BeiderMorseFilter extends TokenFilter {
private final Matcher matcher = pattern.matcher("");
// encoded representation
private String encoded;
// offsets for any buffered outputs
private int startOffset;
private int endOffset;
// preserves all attributes for any buffered outputs
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
@ -83,10 +80,10 @@ public final class BeiderMorseFilter extends TokenFilter {
@Override
public boolean incrementToken() throws IOException {
if (matcher.find()) {
clearAttributes();
assert state != null && encoded != null;
restoreState(state);
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
posIncAtt.setPositionIncrement(0);
offsetAtt.setOffset(startOffset, endOffset);
return true;
}
@ -94,8 +91,7 @@ public final class BeiderMorseFilter extends TokenFilter {
encoded = (languages == null)
? engine.encode(termAtt.toString())
: engine.encode(termAtt.toString(), languages);
startOffset = offsetAtt.startOffset();
endOffset = offsetAtt.endOffset();
state = captureState();
matcher.reset(encoded);
if (matcher.find()) {
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));

View File

@ -19,7 +19,9 @@ package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.regex.Pattern;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
@ -29,7 +31,10 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.junit.Ignore;
/** Tests {@link BeiderMorseFilter} */
@ -103,4 +108,20 @@ public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
};
checkOneTermReuse(a, "", "");
}
/**
 * Regression test for LUCENE-4993: every token emitted by {@link BeiderMorseFilter},
 * including the additional phonetic encodings it produces for a single input term,
 * must still carry the custom attribute set by an upstream filter (here the keyword flag).
 *
 * @throws IOException if consuming the token stream fails
 */
public void testCustomAttribute() throws IOException {
  TokenStream stream = new KeywordTokenizer(new StringReader("D'Angelo"));
  // The pattern ".*" matches any term, so the upstream marker filter is expected
  // to flag the single input token as a keyword.
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  try {
    stream.reset();
    int tokenCount = 0;
    while (stream.incrementToken()) {
      // The keyword flag must be present on every emitted token, not only the first.
      assertTrue("token #" + tokenCount + " lost its keyword attribute", keyAtt.isKeyword());
      tokenCount++;
    }
    // Number of encodings expected for "D'Angelo" with the GENERIC/EXACT engine.
    assertEquals(12, tokenCount);
    stream.end();
  } finally {
    // Close even when an assertion above fails, so the stream is never leaked.
    stream.close();
  }
}
}