diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 561a9ae79a0..537a28e6972 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -19,6 +19,14 @@ Changes in backwards compatibility policy (Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless, Robert Muir) +======================= Lucene 4.3.0 ======================= + +Changes in backwards compatibility policy + +* LUCENE-4810: EdgeNGramTokenFilter no longer increments position for + multiple ngrams derived from the same input token. (Walter Underwood + via Mike McCandless) + ======================= Lucene 4.2.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 94ee613d9d0..71de66ed5c5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import java.io.IOException; @@ -73,9 +74,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter { private int tokStart; private int tokEnd; // only used if the length changed before this filter private boolean hasIllegalOffsets; // only if the length changed before this filter + private int savePosIncr; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range @@ -132,6 +135,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter { // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. hasIllegalOffsets = (tokStart + curTermLength) != tokEnd; + savePosIncr = posIncrAtt.getPositionIncrement(); } } if (curGramSize <= maxGram) { @@ -146,6 +150,12 @@ public final class EdgeNGramTokenFilter extends TokenFilter { } else { offsetAtt.setOffset(tokStart + start, tokStart + end); } + // first ngram gets increment, others don't + if (curGramSize == minGram) { + posIncrAtt.setPositionIncrement(savePosIncr); + } else { + posIncrAtt.setPositionIncrement(0); + } termAtt.copyBuffer(curTermBuffer, start, curGramSize); curGramSize++; return true; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java index b5163ae910e..788c9fd59f2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -17,14 +17,15 @@ package org.apache.lucene.analysis.ngram; * limitations under the License. */ +import java.io.IOException; +import java.io.Reader; + import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.AttributeSource; -import java.io.IOException; -import java.io.Reader; - /** * Tokenizes the input from an edge into n-grams of given size(s). *

@@ -39,6 +40,7 @@ public final class EdgeNGramTokenizer extends Tokenizer { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** Specifies which side of the input the n-gram should be generated from */ public static enum Side { @@ -214,6 +216,9 @@ public final class EdgeNGramTokenizer extends Tokenizer { if (inLen == 0) { return false; } + posIncrAtt.setPositionIncrement(1); + } else { + posIncrAtt.setPositionIncrement(0); } // if the remaining input is too short, we can't generate any n-grams diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index 62c7ca119e8..7446793cf04 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -105,6 +105,33 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { null, false); } + + public void testFilterPositions() throws Exception { + TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3); + assertTokenStreamContents(tokenizer, + new String[]{"a","ab","abc","v","vw","vwx"}, + new int[]{0,0,0,6,6,6}, + new int[]{1,2,3,7,8,9}, + null, + new int[]{1,0,0,1,0,0}, + null, + null, + false); + } + + public void testTokenizerPositions() throws Exception { + EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3); + assertTokenStreamContents(tokenizer, + new String[]{"a","ab","abc"}, + new int[]{0,0,0}, + new int[]{1,2,3}, + null, + new int[]{1,0,0}, + null, + null, + false); + } public void testSmallTokenInStream() throws Exception { input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);