mirror of https://github.com/apache/lucene.git
LUCENE-4810: don't increment position on every gram (only the first, for a given input token) in EdgeNGramTokenizer/Filter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1453937 13f79535-47bb-0310-9956-ffa450edef68
commit b16b38118e
parent 917abf2873
lucene/CHANGES.txt

@@ -19,6 +19,14 @@ Changes in backwards compatibility policy
   (Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
   Robert Muir)
 
+======================= Lucene 4.3.0 =======================
+
+Changes in backwards compatibility policy
+
+* LUCENE-4810: EdgeNGramTokenFilter no longer increments position for
+  multiple ngrams derived from the same input token.  (Walter Underwood
+  via Mike McCandless)
+
 ======================= Lucene 4.2.0 =======================
 
 Changes in backwards compatibility policy
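For context: before this change the filter left each gram's position increment untouched, so every gram advanced the position and "abcde vwxyz" occupied six consecutive positions; after it, all grams derived from one input token stack at the same position, like synonyms. A minimal consumer-loop sketch that makes this visible, assuming the Lucene 4.x analysis-common APIs (the demo class and the WhitespaceTokenizer chain are illustrative, not part of the commit):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class EdgeNGramPositionsDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("abcde vwxyz"));
    ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    int pos = -1; // running absolute position
    while (ts.incrementToken()) {
      pos += posIncr.getPositionIncrement();
      System.out.println(term + " @ position " + pos);
    }
    ts.end();
    ts.close();
    // After this commit: a/ab/abc share position 0 and v/vw/vwx share position 1.
    // Before it, every gram advanced the position, yielding positions 0 through 5.
  }
}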
lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java

@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 import java.io.IOException;
 
@@ -73,9 +74,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int tokStart;
   private int tokEnd; // only used if the length changed before this filter
   private boolean hasIllegalOffsets; // only if the length changed before this filter
+  private int savePosIncr;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -132,6 +135,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
         // if length by start + end offsets doesn't match the term text then assume
         // this is a synonym and don't adjust the offsets.
         hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
+        savePosIncr = posIncrAtt.getPositionIncrement();
       }
     }
     if (curGramSize <= maxGram) {
@@ -146,6 +150,12 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
       } else {
        offsetAtt.setOffset(tokStart + start, tokStart + end);
       }
+      // first ngram gets increment, others don't
+      if (curGramSize == minGram) {
+        posIncrAtt.setPositionIncrement(savePosIncr);
+      } else {
+        posIncrAtt.setPositionIncrement(0);
+      }
       termAtt.copyBuffer(curTermBuffer, start, curGramSize);
       curGramSize++;
       return true;
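Note the save-and-replay design: the filter captures the incoming token's increment in savePosIncr and replays it on the first gram rather than hardcoding 1, so position gaps created upstream (for example by a stop filter) survive edge n-gramming. A sketch of that interaction, assuming the 4.x StopFilter API (the demo wiring is illustrative, not from the commit):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class SavePosIncrDemo {
  public static void main(String[] args) throws Exception {
    // "the" is removed by the stop filter, leaving a position gap:
    // "quick" arrives with increment 2 instead of 1
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("the quick"));
    ts = new StopFilter(Version.LUCENE_42, ts, StopFilter.makeStopSet(Version.LUCENE_42, "the"));
    ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 2);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // expected: q +2, qu +0 -- the saved gap survives on the first gram only
      System.out.println(term + " +" + posIncr.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}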
lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java

@@ -17,14 +17,15 @@ package org.apache.lucene.analysis.ngram;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.AttributeSource;
 
-import java.io.IOException;
-import java.io.Reader;
-
 /**
  * Tokenizes the input from an edge into n-grams of given size(s).
  * <p>
@@ -39,6 +40,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /** Specifies which side of the input the n-gram should be generated from */
   public static enum Side {
@@ -214,6 +216,9 @@ public final class EdgeNGramTokenizer extends Tokenizer {
       if (inLen == 0) {
         return false;
       }
-    }
+      posIncrAtt.setPositionIncrement(1);
+    } else {
+      posIncrAtt.setPositionIncrement(0);
+    }
 
     // if the remaining input is too short, we can't generate any n-grams
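The tokenizer variant treats the entire input as a single logical token, so only the very first gram advances the position (increment 1) and every later gram stacks at increment 0. The same behavior outside the test framework, as a sketch (the demo class is assumed, not part of the commit; the constructor matches the one exercised by the tests below):

import java.io.StringReader;

import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class EdgeNGramTokenizerDemo {
  public static void main(String[] args) throws Exception {
    EdgeNGramTokenizer t = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = t.addAttribute(PositionIncrementAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      // prints: a +1, ab +0, abc +0
      System.out.println(term + " +" + posIncr.getPositionIncrement());
    }
    t.end();
    t.close();
  }
}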
lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

@@ -105,6 +105,33 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
                 null,
                 false);
   }
+
+  public void testFilterPositions() throws Exception {
+    TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+    assertTokenStreamContents(tokenizer,
+                              new String[]{"a","ab","abc","v","vw","vwx"},
+                              new int[]{0,0,0,6,6,6},
+                              new int[]{1,2,3,7,8,9},
+                              null,
+                              new int[]{1,0,0,1,0,0},
+                              null,
+                              null,
+                              false);
+  }
+
+  public void testTokenizerPositions() throws Exception {
+    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
+    assertTokenStreamContents(tokenizer,
+                              new String[]{"a","ab","abc"},
+                              new int[]{0,0,0},
+                              new int[]{1,2,3},
+                              null,
+                              new int[]{1,0,0},
+                              null,
+                              null,
+                              false);
+  }
 
   public void testSmallTokenInStream() throws Exception {
     input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
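For readers who don't live in BaseTokenStreamTestCase: the long assertTokenStreamContents argument list in these tests appears to map to the overload annotated below. The comments are my reading of that era's test framework, not part of the commit:

assertTokenStreamContents(tokenizer,
    new String[]{"a","ab","abc"},  // expected terms
    new int[]{0,0,0},              // expected start offsets
    new int[]{1,2,3},              // expected end offsets
    null,                          // expected token types (not checked)
    new int[]{1,0,0},              // expected position increments -- the point of this commit
    null,                          // expected position lengths (not checked)
    null,                          // expected final offset (not checked)
    false);                        // offsetsAreCorrect: relax strict offset checking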