mirror of https://github.com/apache/lucene.git
LUCENE-4810: position increment for first output token from EdgeNGramTokenFilter must be > 0
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1470496 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
44a400f99a
commit
ca4e843716
|
@ -75,6 +75,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
private int tokEnd; // only used if the length changed before this filter
|
||||
private boolean hasIllegalOffsets; // only if the length changed before this filter
|
||||
private int savePosIncr;
|
||||
private boolean isFirstToken = true;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
@ -138,9 +139,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
savePosIncr = posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
if (curGramSize <= maxGram) {
|
||||
if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
|
||||
|| curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
|
||||
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
|
||||
if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
|
||||
// grab gramSize chars from front or back
|
||||
int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
|
||||
int end = start + curGramSize;
|
||||
|
@ -152,12 +152,16 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
}
|
||||
// first ngram gets increment, others don't
|
||||
if (curGramSize == minGram) {
|
||||
posIncrAtt.setPositionIncrement(savePosIncr);
|
||||
// Leave the first token position increment at the cleared-attribute value of 1
|
||||
if ( ! isFirstToken) {
|
||||
posIncrAtt.setPositionIncrement(savePosIncr);
|
||||
}
|
||||
} else {
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
}
|
||||
termAtt.copyBuffer(curTermBuffer, start, curGramSize);
|
||||
curGramSize++;
|
||||
isFirstToken = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -169,5 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
// Restores this filter's per-stream state: delegates to super.reset(), clears curTermBuffer, and re-arms isFirstToken.
public void reset() throws IOException {
|
||||
super.reset();
|
||||
curTermBuffer = null; // NOTE(review): presumably makes incrementToken() pull a fresh input token next time — confirm; that code is outside this hunk
|
||||
isFirstToken = true; // the next token emitted is again treated as the first output token, so its position increment is left untouched
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
||||
import org.apache.lucene.analysis.position.PositionFilter;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
@ -120,6 +121,21 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
false);
|
||||
}
|
||||
|
||||
// Regression test for LUCENE-4810: the first token emitted by EdgeNGramTokenFilter must have a position increment > 0,
// even when the input token it is derived from arrives with a position increment of 0.
public void testFirstTokenPositionIncrement() throws Exception {
|
||||
TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
|
||||
ts = new PositionFilter(ts, 0); // All but first token will get 0 position increment
|
||||
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
|
||||
// The first token "a" will not be output, since it's smaller than the mingram size of 2.
|
||||
// The second token on input to EdgeNGramTokenFilter will have position increment of 0,
|
||||
// which should be increased to 1, since this is the first output token in the stream.
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] { "ab", "abc" }, // expected terms: the 2- and 3-character front n-grams of "abc"
|
||||
new int[] { 2, 2 }, // presumably expected start offsets ("abc" begins at offset 2 of "a abc") — confirm against assertTokenStreamContents
|
||||
new int[] { 4, 5 }, // presumably expected end offsets — confirm against assertTokenStreamContents
|
||||
new int[] { 1, 0 } // expected position increments: 1 for the first output token, 0 for the following n-gram
|
||||
);
|
||||
}
|
||||
|
||||
public void testTokenizerPositions() throws Exception {
|
||||
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
|
||||
assertTokenStreamContents(tokenizer,
|
||||
|
|
|
@ -52,6 +52,14 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
|
||||
new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
|
||||
new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
|
||||
|
||||
assertAnalyzesToReuse(
|
||||
a,
|
||||
"T. Gl\u00FCcksberg",
|
||||
new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
|
||||
new int[] { 0, 0, 0, 3 },
|
||||
new int[] { 1, 1, 1, 13 },
|
||||
new int[] { 1, 0, 0, 1 });
|
||||
}
|
||||
|
||||
/** Test reuse of MorfologikFilter with leftover stems. */
|
||||
|
|
Loading…
Reference in New Issue