LUCENE-4810: don't increment position on every gram (only the first, for a given input token) in EdgeNGramTokenizer/Filter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1453937 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2013-03-07 16:07:23 +00:00
parent 917abf2873
commit b16b38118e
4 changed files with 53 additions and 3 deletions

CHANGES.txt

@@ -19,6 +19,14 @@ Changes in backwards compatibility policy
(Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
Robert Muir)
======================= Lucene 4.3.0 =======================
Changes in backwards compatibility policy
* LUCENE-4810: EdgeNGramTokenFilter no longer increments position for
multiple ngrams derived from the same input token. (Walter Underwood
via Mike McCandless)
======================= Lucene 4.2.0 =======================
Changes in backwards compatibility policy
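
For context, a minimal sketch (not part of this commit) of the behavior the LUCENE-4810 entry above describes. It assumes the Lucene 4.x analysis APIs exercised by the diffs below; WhitespaceTokenizer stands in for the MockTokenizer used in the tests, and the class name EdgeNGramPositionDemo is made up for illustration:

// Sketch only: shows the new position increments after this change.
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class EdgeNGramPositionDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("abcde vwxyz"));
    ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // previously every gram advanced the position; now this prints
      // a:1 ab:0 abc:0 v:1 vw:0 vwx:0
      System.out.println(term + ":" + posIncr.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}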

org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java

@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
@@ -73,9 +74,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
private boolean hasIllegalOffsets; // only if the length changed before this filter
private int savePosIncr;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -132,6 +135,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
savePosIncr = posIncrAtt.getPositionIncrement();
}
}
if (curGramSize <= maxGram) {
@@ -146,6 +150,12 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
offsetAtt.setOffset(tokStart + start, tokStart + end);
}
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
} else {
posIncrAtt.setPositionIncrement(0);
}
termAtt.copyBuffer(curTermBuffer, start, curGramSize);
curGramSize++;
return true;
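
The cached savePosIncr above is replayed only on the gram whose size equals minGram, i.e. the first gram emitted for the current token; every later gram gets an increment of 0, so all grams stack at the source token's position. The same pattern in isolation, as a hypothetical one-to-many filter (DuplicatingFilter is an illustration only, not a Lucene class):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

// Emits every input token twice; both copies occupy one position.
public final class DuplicatingFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private AttributeSource.State saved; // pending copy of the current token

  public DuplicatingFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (saved != null) {
      restoreState(saved); // emit the remembered token a second time
      saved = null;
      posIncrAtt.setPositionIncrement(0); // stacked on the first copy's position
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    saved = captureState(); // remember it for the duplicate
    return true; // the first copy keeps the input's own increment
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    saved = null;
  }
}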

org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java

@@ -17,14 +17,15 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.io.Reader;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
@@ -39,6 +40,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/** Specifies which side of the input the n-gram should be generated from */
public static enum Side {
@@ -214,6 +216,9 @@ public final class EdgeNGramTokenizer extends Tokenizer {
if (inLen == 0) {
return false;
}
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(0);
}
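
A usage sketch for the tokenizer side (again not part of the commit; the constructor is the one the new test below uses, and EdgeNGramTokenizerDemo is a made-up name):

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class EdgeNGramTokenizerDemo {
  public static void main(String[] args) throws Exception {
    EdgeNGramTokenizer t = new EdgeNGramTokenizer(
        new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = t.addAttribute(PositionIncrementAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      // expected: a posIncr=1, ab posIncr=0, abc posIncr=0
      System.out.println(term + " posIncr=" + posIncr.getPositionIncrement());
    }
    t.end();
    t.close();
  }
}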
// if the remaining input is too short, we can't generate any n-grams

org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

@@ -105,6 +105,33 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
null,
false);
}
public void testFilterPositions() throws Exception {
TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
assertTokenStreamContents(tokenizer,
new String[]{"a","ab","abc","v","vw","vwx"},
new int[]{0,0,0,6,6,6},
new int[]{1,2,3,7,8,9},
null,
new int[]{1,0,0,1,0,0},
null,
null,
false);
}
public void testTokenizerPositions() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
assertTokenStreamContents(tokenizer,
new String[]{"a","ab","abc"},
new int[]{0,0,0},
new int[]{1,2,3},
null,
new int[]{1,0,0},
null,
null,
false);
}
public void testSmallTokenInStream() throws Exception {
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);