Fix EdgeNGramTokenFilter to correctly handle graph inputs.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1484075 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-05-18 09:21:46 +00:00
parent 95f0553bc0
commit 0679c7a88f
2 changed files with 27 additions and 5 deletions

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Version;
/**
@@ -43,11 +44,13 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
private int savePosIncr;
private int savePosLen;
private boolean isFirstToken = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -88,7 +91,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
savePosIncr = posIncrAtt.getPositionIncrement();
savePosIncr += posIncrAtt.getPositionIncrement();
savePosLen = posLenAtt.getPositionLength();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
@@ -98,13 +102,12 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
offsetAtt.setOffset(tokStart, tokEnd);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
// Leave the first token position increment at the cleared-attribute value of 1
if ( ! isFirstToken) {
posIncrAtt.setPositionIncrement(savePosIncr);
}
posIncrAtt.setPositionIncrement(savePosIncr);
savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
curGramSize++;
isFirstToken = false;
@@ -120,5 +123,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
super.reset();
curTermBuffer = null;
isFirstToken = true;
savePosIncr = 0;
}
}

View File

@@ -29,8 +29,11 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
/**
* Tests {@link EdgeNGramTokenFilter} for correctness.
@@ -187,4 +190,19 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
}
/**
 * Verifies that edge n-gramming a graph token stream (ShingleFilter emits
 * tokens whose position length is > 1) preserves position increments,
 * position lengths, and offsets of the surviving shingles.
 */
public void testGraphs() throws IOException {
  // Tokenize on letters, shingle into a graph, then keep only edge n-grams
  // of length 7..10 — shorter shingles are dropped entirely.
  TokenStream stream = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
  stream = new ShingleFilter(stream);
  stream = new EdgeNGramTokenFilter(Version.LUCENE_44, stream, 7, 10);
  stream.reset();
  final String[] expectedTerms = { "efgh ij", "ij klmn", "ij klmno", "klmno p" };
  final int[] expectedStartOffsets = { 6, 11, 11, 14 };
  final int[] expectedEndOffsets = { 13, 19, 19, 21 };
  // Position increment is accumulated across dropped tokens; grams expanded
  // from the same shingle after the first get an increment of 0.
  final int[] expectedPosIncrements = { 3, 1, 0, 1 };
  // Each surviving shingle spans two positions in the graph.
  final int[] expectedPosLengths = { 2, 2, 2, 2 };
  assertTokenStreamContents(stream,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      expectedPosLengths,
      23 // final offset of the whole input string
  );
}
}