diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 58201d62919..530b0d40da9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -61,6 +61,13 @@ Other
 
 * LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
 
+======================= Lucene 6.5.0 =======================
+
+Bug Fixes
+
+* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
+  and preserve all attributes. (Nathan Gass via Uwe Schindler)
+
 ======================= Lucene 6.4.0 =======================
 
 API Changes
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 827e26ffda1..47b80ffad1a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -22,9 +22,8 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the given token into n-grams of given size(s).
@@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int curTermLength;
   private int curCodePointCount;
   private int curGramSize;
-  private int tokStart;
-  private int tokEnd; // only used if the length changed before this filter
   private int savePosIncr;
-  private int savePosLen;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           curTermLength = termAtt.length();
           curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
           curGramSize = minGram;
-          tokStart = offsetAtt.startOffset();
-          tokEnd = offsetAtt.endOffset();
+          state = captureState();
           savePosIncr += posIncrAtt.getPositionIncrement();
-          savePosLen = posLenAtt.getPositionLength();
         }
       }
       if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
         if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
           // grab gramSize chars from front or back
-          clearAttributes();
-          offsetAtt.setOffset(tokStart, tokEnd);
+          restoreState(state);
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
             posIncrAtt.setPositionIncrement(savePosIncr);
@@ -104,7 +96,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
-          posLenAtt.setPositionLength(savePosLen);
           final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
           termAtt.copyBuffer(curTermBuffer, 0, charLength);
           curGramSize++;
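The core of the EdgeNGramTokenFilter change above is the swap from clearAttributes() plus manual offset and position-length bookkeeping to a single captureState()/restoreState() pair: the snapshot carries every attribute of the source token, including payloads the filter never names explicitly. The following is a minimal sketch of that pattern using only Lucene's stock AttributeSource API; DuplicatingTokenFilter is a hypothetical example class, not part of this patch.

```java
import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource.State;

/** Emits each input token followed by a "-copy" variant that keeps all attributes. */
public final class DuplicatingTokenFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private State state;          // snapshot of every attribute of the current token
  private boolean copyPending;  // whether the duplicate still has to be emitted

  public DuplicatingTokenFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (copyPending) {
      // restoreState replays payload, offsets, type, flags -- all attributes --
      // where clearAttributes() would have reset them to their defaults.
      restoreState(state);
      termAtt.append("-copy");
      copyPending = false;
      return true;
    }
    if (input.incrementToken()) {
      state = captureState(); // snapshot the token before any modification
      copyPending = true;
      return true;
    }
    return false;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    state = null;
    copyPending = false;
  }
}
```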
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index e275cfa88f5..cb5d4474e25 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the input into n-grams of the given size(s).
@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curCodePointCount;
   private int curGramSize;
   private int curPos;
-  private int curPosInc, curPosLen;
-  private int tokStart;
-  private int tokEnd;
+  private int curPosInc;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncAtt;
-  private final PositionLengthAttribute posLenAtt;
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates NGramTokenFilter with given min and max n-grams.
@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
     this.maxGram = maxGram;
 
     posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    posLenAtt = addAttribute(PositionLengthAttribute.class);
   }
 
   /**
@@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
         curGramSize = minGram;
         curPos = 0;
         curPosInc = posIncAtt.getPositionIncrement();
-        curPosLen = posLenAtt.getPositionLength();
-        tokStart = offsetAtt.startOffset();
-        tokEnd = offsetAtt.endOffset();
+        state = captureState();
       }
     }
 
@@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
         curGramSize = minGram;
       }
       if ((curPos + curGramSize) <= curCodePointCount) {
-        clearAttributes();
+        restoreState(state);
        final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
         final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
         termAtt.copyBuffer(curTermBuffer, start, end - start);
         posIncAtt.setPositionIncrement(curPosInc);
         curPosInc = 0;
-        posLenAtt.setPositionLength(curPosLen);
-        offsetAtt.setOffset(tokStart, tokEnd);
         curGramSize++;
         return true;
       }
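NGramTokenFilter gets the same capture/restore treatment. To see the user-visible effect outside the factory tests that follow, here is a hedged usage sketch; the class names and constructors are the Lucene 6.x analysis-common ones, and PayloadNGramDemo is an illustrative name only. Before this fix every gram carried a null payload; with it, each gram keeps the source token's 0.1.

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

public class PayloadNGramDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("test|0.1"));
    // Attach a float payload from the "|0.1" suffix, then n-gram the term.
    TokenStream stream = new DelimitedPayloadTokenFilter(source, '|', new FloatEncoder());
    stream = new NGramTokenFilter(stream, 1, 2);

    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    PayloadAttribute payAtt = stream.addAttribute(PayloadAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      BytesRef payload = payAtt.getPayload();
      // With the fix: "t -> 0.1", "te -> 0.1", "e -> 0.1", ...; before it: "... -> null".
      System.out.println(termAtt + " -> "
          + (payload == null ? "null" : PayloadHelper.decodeFloat(payload.bytes)));
    }
    stream.end();
    stream.close();
  }
}
```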
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
index 12433526f6e..5de532f4c09 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
@@ -22,7 +22,10 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Simple tests to ensure the NGram filter factories are working.
@@ -76,6 +79,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
         new String[] { "te", "tes", "es", "est", "st" });
   }
 
+  /**
+   * Test NGramFilterFactory on tokens with payloads
+   */
+  public void testNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
+
   /**
    * Test EdgeNGramTokenizerFactory
    */
@@ -123,6 +148,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
     assertTokenStreamContents(stream,
         new String[] { "t", "te" });
   }
+
+  /**
+   * Test EdgeNGramFilterFactory on tokens with payloads
+   */
+  public void testEdgeNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
 
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
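One caveat about the two payload tests above: the while loops assert on every emitted token, but they would also pass vacuously if the stream emitted no tokens at all. A sketch of a tightened variant that counts grams; testNGramFilterPayloadNonEmpty is a hypothetical name, not part of the patch, and the helpers (whitespaceMockTokenizer, tokenFilterFactory) come from BaseTokenStreamFactoryTestCase just as in the tests above.

```java
/**
 * Like testNGramFilterPayload above, but fails if the stream emits nothing.
 */
public void testNGramFilterPayloadNonEmpty() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  int grams = 0;
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    assertEquals(0.1f, PayloadHelper.decodeFloat(payData.bytes), 0.0f);
    grams++;
  }
  stream.end();
  stream.close();
  assertTrue("expected at least one n-gram", grams > 0);
}
```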