From 61e45283061ae486acc5882c5a770025c8291222 Mon Sep 17 00:00:00 2001 From: Nathan Gass Date: Mon, 9 Jan 2017 14:59:31 +0100 Subject: [PATCH 1/5] add test that EdgeNGram filter keeps payloads --- .../analysis/ngram/TestNGramFilters.java | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index 12433526f6e..b6f4405555f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -22,7 +22,10 @@ import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.payloads.PayloadHelper; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.util.BytesRef; /** * Simple tests to ensure the NGram filter factories are working. 
@@ -123,6 +126,25 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { assertTokenStreamContents(stream, new String[] { "t", "te" }); } + + public void testEdgeNGramFilterPayload() throws Exception { + Reader reader = new StringReader("test|0.1"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream); + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream); + + stream.reset(); + while (stream.incrementToken()) { + PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class); + assertNotNull(payAttr); + BytesRef payData = payAttr.getPayload(); + assertNotNull(payData); + float payFloat = PayloadHelper.decodeFloat(payData.bytes, payData.offset); + assertEquals(0.1f, payFloat, 0.0f); + } + stream.end(); + stream.close(); + } /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { From 6570e6ecc2b14a28da9873948083791ba47145d0 Mon Sep 17 00:00:00 2001 From: Nathan Gass Date: Mon, 9 Jan 2017 15:00:21 +0100 Subject: [PATCH 2/5] copy all attributes including payload to new tokens --- .../analysis/ngram/EdgeNGramTokenFilter.java | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 827e26ffda1..303b7e3201d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -22,9 +22,8 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import
org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.AttributeSource; /** * Tokenizes the given token into n-grams of given size(s). @@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter { private int curTermLength; private int curCodePointCount; private int curGramSize; - private int tokStart; - private int tokEnd; // only used if the length changed before this filter private int savePosIncr; - private int savePosLen; + private AttributeSource attributes; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); /** * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range @@ -86,17 +81,15 @@ public final class EdgeNGramTokenFilter extends TokenFilter { curTermLength = termAtt.length(); curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); curGramSize = minGram; - tokStart = offsetAtt.startOffset(); - tokEnd = offsetAtt.endOffset(); + attributes = input.cloneAttributes(); savePosIncr += posIncrAtt.getPositionIncrement(); - savePosLen = posLenAtt.getPositionLength(); } } if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams // grab gramSize chars from front or back clearAttributes(); - offsetAtt.setOffset(tokStart, tokEnd); + attributes.copyTo(this); // first ngram gets increment, others don't if (curGramSize == minGram) { 
posIncrAtt.setPositionIncrement(savePosIncr); @@ -104,7 +97,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter { } else { posIncrAtt.setPositionIncrement(0); } - posLenAtt.setPositionLength(savePosLen); final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; From 01f2a87c67392a86b533d0c76ba7666845d1945f Mon Sep 17 00:00:00 2001 From: Nathan Gass Date: Fri, 13 Jan 2017 15:54:07 +0100 Subject: [PATCH 3/5] use captureState and restoreState instead of cloneAttributes --- .../lucene/analysis/ngram/EdgeNGramTokenFilter.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 303b7e3201d..df12fda3e0e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.AttributeSource.State; /** * Tokenizes the given token into n-grams of given size(s). 
@@ -43,7 +43,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter { private int curCodePointCount; private int curGramSize; private int savePosIncr; - private AttributeSource attributes; + private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); @@ -81,15 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter { curTermLength = termAtt.length(); curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); curGramSize = minGram; - attributes = input.cloneAttributes(); + state = captureState(); savePosIncr += posIncrAtt.getPositionIncrement(); } } if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams // grab gramSize chars from front or back - clearAttributes(); - attributes.copyTo(this); + restoreState(state); // first ngram gets increment, others don't if (curGramSize == minGram) { posIncrAtt.setPositionIncrement(savePosIncr); From 80e2854247cce485920b45acdeffa3e68bcea385 Mon Sep 17 00:00:00 2001 From: Nathan Gass Date: Fri, 13 Jan 2017 16:42:41 +0100 Subject: [PATCH 4/5] add comment and test for ngram token filter --- .../analysis/ngram/TestNGramFilters.java | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index b6f4405555f..5de532f4c09 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -79,6 +79,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { new String[] { "te", "tes", 
"es", "est", "st" }); } + /** + * Test NGramFilterFactory on tokens with payloads + */ + public void testNGramFilterPayload() throws Exception { + Reader reader = new StringReader("test|0.1"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream); + stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream); + + stream.reset(); + while (stream.incrementToken()) { + PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class); + assertNotNull(payAttr); + BytesRef payData = payAttr.getPayload(); + assertNotNull(payData); + float payFloat = PayloadHelper.decodeFloat(payData.bytes, payData.offset); + assertEquals(0.1f, payFloat, 0.0f); + } + stream.end(); + stream.close(); + } + /** * Test EdgeNGramTokenizerFactory */ @@ -127,6 +149,9 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { new String[] { "t", "te" }); } + /** + * Test EdgeNGramFilterFactory on tokens with payloads + */ public void testEdgeNGramFilterPayload() throws Exception { Reader reader = new StringReader("test|0.1"); TokenStream stream = whitespaceMockTokenizer(reader); From ea049b96a24d6afc582ecdf406e8bf256b9911d9 Mon Sep 17 00:00:00 2001 From: Nathan Gass Date: Fri, 13 Jan 2017 17:01:34 +0100 Subject: [PATCH 5/5] also copy all attributes for ngram token filters --- .../analysis/ngram/NGramTokenFilter.java | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index e275cfa88f5..cb5d4474e25 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter; import
org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.AttributeSource.State; /** * Tokenizes the input into n-grams of the given size(s). @@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter { private int curCodePointCount; private int curGramSize; private int curPos; - private int curPosInc, curPosLen; - private int tokStart; - private int tokEnd; + private int curPosInc; + private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncAtt; - private final PositionLengthAttribute posLenAtt; - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /** * Creates NGramTokenFilter with given min and max n-grams. 
@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter { this.maxGram = maxGram; posIncAtt = addAttribute(PositionIncrementAttribute.class); - posLenAtt = addAttribute(PositionLengthAttribute.class); } /** @@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter { curGramSize = minGram; curPos = 0; curPosInc = posIncAtt.getPositionIncrement(); - curPosLen = posLenAtt.getPositionLength(); - tokStart = offsetAtt.startOffset(); - tokEnd = offsetAtt.endOffset(); + state = captureState(); } } @@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter { curGramSize = minGram; } if ((curPos + curGramSize) <= curCodePointCount) { - clearAttributes(); + restoreState(state); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); termAtt.copyBuffer(curTermBuffer, start, end - start); posIncAtt.setPositionIncrement(curPosInc); curPosInc = 0; - posLenAtt.setPositionLength(curPosLen); - offsetAtt.setOffset(tokStart, tokEnd); curGramSize++; return true; }