mirror of https://github.com/apache/lucene.git
Fix EdgeNGramTokenFilter to correctly handle graph inputs.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1484075 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 95f0553bc0
commit 0679c7a88f
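For readers skimming the diff: a token "graph" here means a stream whose tokens can span more than one position (position length > 1) and can share a position (position increment 0), as produced by filters such as ShingleFilter. Before this change the filter overwrote the saved position increment on every input token and never read the position length, so attributes were lost once the input was a graph. The sketch below is illustrative only and is not part of the commit; it mirrors the chain used in the new testGraphs test further down and simply prints the attributes the fix now carries through. The class name EdgeNGramGraphExample and the printing loop are hypothetical additions, and the sketch assumes the same Lucene 4.4-era APIs the test uses.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Version;

public class EdgeNGramGraphExample {
  public static void main(String[] args) throws Exception {
    // ShingleFilter turns the flat token stream into a graph: each shingle spans
    // two positions and shares its start position with a unigram.
    TokenStream ts = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
    ts = new ShingleFilter(ts);
    ts = new EdgeNGramTokenFilter(Version.LUCENE_44, ts, 7, 10); // front grams of 7..10 chars

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // With the fix, increments of tokens too short to produce a gram are
      // accumulated, and the shingles' position lengths are copied through.
      System.out.println(term + " posIncr=" + posIncr.getPositionIncrement()
          + " posLen=" + posLen.getPositionLength());
    }
    ts.end();
    ts.close();
  }
}

Without the fix, the increments of skipped short tokens would be dropped and every emitted gram would report the cleared-attribute position length of 1.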
EdgeNGramTokenFilter.java:

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.Version;
 
 /**
@@ -43,11 +44,13 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int tokStart;
   private int tokEnd; // only used if the length changed before this filter
   private int savePosIncr;
+  private int savePosLen;
   private boolean isFirstToken = true;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -88,7 +91,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           curGramSize = minGram;
           tokStart = offsetAtt.startOffset();
           tokEnd = offsetAtt.endOffset();
-          savePosIncr = posIncrAtt.getPositionIncrement();
+          savePosIncr += posIncrAtt.getPositionIncrement();
+          savePosLen = posLenAtt.getPositionLength();
         }
       }
       if (curGramSize <= maxGram) {   // if we have hit the end of our n-gram size range, quit
@@ -98,13 +102,12 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           offsetAtt.setOffset(tokStart, tokEnd);
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
-            // Leave the first token position increment at the cleared-attribute value of 1
-            if ( ! isFirstToken) {
-              posIncrAtt.setPositionIncrement(savePosIncr);
-            }
+            posIncrAtt.setPositionIncrement(savePosIncr);
+            savePosIncr = 0;
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
+          posLenAtt.setPositionLength(savePosLen);
           termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
           curGramSize++;
           isFirstToken = false;
@@ -120,5 +123,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
     super.reset();
     curTermBuffer = null;
     isFirstToken = true;
+    savePosIncr = 0;
   }
 }
EdgeNGramTokenFilterTest.java:

@@ -29,8 +29,11 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.Version;
 
 /**
  * Tests {@link EdgeNGramTokenFilter} for correctness.
@@ -187,4 +190,19 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
     };
     checkAnalysisConsistency(random, a, random.nextBoolean(), "");
   }
+
+  public void testGraphs() throws IOException {
+    TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
+    tk = new ShingleFilter(tk);
+    tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
+    tk.reset();
+    assertTokenStreamContents(tk,
+        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
+        new int[] { 6,11,11,14 },
+        new int[] { 13,19,19,21 },
+        new int[] { 3,1,0,1 },
+        new int[] { 2,2,2,2 },
+        23
+    );
+  }
 }