mirror of https://github.com/apache/lucene.git
LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes
[merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]

Signed-off-by: Uwe Schindler <uschindler@apache.org>

commit c64a01158e
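The core of the change: instead of hand-copying a fixed set of attributes
(offsets, position increment, position length) from the original token onto
each emitted gram, which silently dropped everything else (notably payloads),
both filters now snapshot the complete attribute state of the input token with
captureState() and replay it with restoreState() before overwriting only the
term text and position increment. A minimal sketch of that idiom, using a
hypothetical duplicating filter for illustration (not part of the patch):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource.State;

// Hypothetical filter: emits every input token twice. Because the second
// copy is produced from a captured State, every attribute of the original
// token (payloads included) is preserved, without the filter having to
// know which attributes exist.
public final class DuplicatingFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private State pending; // full snapshot of the token still to be re-emitted

  public DuplicatingFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);              // bring back ALL attributes
      posIncrAtt.setPositionIncrement(0); // overwrite only what changes
      pending = null;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    pending = captureState();             // snapshot before the next call
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pending = null;
  }
}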
lucene/CHANGES.txt

@@ -61,6 +61,13 @@ Other
 
 * LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
 
+======================= Lucene 6.5.0 =======================
+
+Bug Fixes
+
+* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
+  and preserve all attributes. (Nathan Gass via Uwe Schindler)
+
 ======================= Lucene 6.4.0 =======================
 
 API Changes
lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java

@@ -22,9 +22,8 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the given token into n-grams of given size(s).

@@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int curTermLength;
   private int curCodePointCount;
   private int curGramSize;
-  private int tokStart;
-  private int tokEnd; // only used if the length changed before this filter
   private int savePosIncr;
-  private int savePosLen;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range

@@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
         curTermLength = termAtt.length();
         curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
         curGramSize = minGram;
-        tokStart = offsetAtt.startOffset();
-        tokEnd = offsetAtt.endOffset();
+        state = captureState();
         savePosIncr += posIncrAtt.getPositionIncrement();
-        savePosLen = posLenAtt.getPositionLength();
       }
     }
     if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
       if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
         // grab gramSize chars from front or back
-        clearAttributes();
-        offsetAtt.setOffset(tokStart, tokEnd);
+        restoreState(state);
         // first ngram gets increment, others don't
         if (curGramSize == minGram) {
           posIncrAtt.setPositionIncrement(savePosIncr);

@@ -104,7 +96,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
         } else {
           posIncrAtt.setPositionIncrement(0);
         }
-        posLenAtt.setPositionLength(savePosLen);
         final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
         termAtt.copyBuffer(curTermBuffer, 0, charLength);
         curGramSize++;
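A quick usage sketch (my own illustration, assuming the Lucene 6.x constructor
EdgeNGramTokenFilter(input, minGram, maxGram)): for the input token "apache"
with minGram=1 and maxGram=3 the filter emits "a", "ap", "apa", and after this
patch each gram carries all attributes of the original token.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer source = new KeywordTokenizer();
    source.setReader(new StringReader("apache"));
    // minGram=1, maxGram=3: emits "a", "ap", "apa", all sharing the
    // offsets, position and (now) payload of the original token.
    TokenStream stream = new EdgeNGramTokenFilter(source, 1, 3);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }
    stream.end();
    stream.close();
  }
}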
lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the input into n-grams of the given size(s).

@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curCodePointCount;
   private int curGramSize;
   private int curPos;
-  private int curPosInc, curPosLen;
-  private int tokStart;
-  private int tokEnd;
+  private int curPosInc;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncAtt;
-  private final PositionLengthAttribute posLenAtt;
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates NGramTokenFilter with given min and max n-grams.

@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
     this.maxGram = maxGram;
 
     posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    posLenAtt = addAttribute(PositionLengthAttribute.class);
   }
 
   /**

@@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
         curGramSize = minGram;
         curPos = 0;
         curPosInc = posIncAtt.getPositionIncrement();
-        curPosLen = posLenAtt.getPositionLength();
-        tokStart = offsetAtt.startOffset();
-        tokEnd = offsetAtt.endOffset();
+        state = captureState();
       }
     }
 

@@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
         curGramSize = minGram;
       }
       if ((curPos + curGramSize) <= curCodePointCount) {
-        clearAttributes();
+        restoreState(state);
         final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
         final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
         termAtt.copyBuffer(curTermBuffer, start, end - start);
         posIncAtt.setPositionIncrement(curPosInc);
         curPosInc = 0;
-        posLenAtt.setPositionLength(curPosLen);
-        offsetAtt.setOffset(tokStart, tokEnd);
         curGramSize++;
         return true;
       }
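To make the position bookkeeping concrete (my own worked example, not part of
the patch): all grams cut from one input token share that token's position, so
only the first gram receives the saved increment and curPosInc is zeroed
afterwards. For "abc" with minGram=1 and maxGram=2 the filter emits a, ab, b,
bc, c.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class NGramDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer source = new KeywordTokenizer();
    source.setReader(new StringReader("abc"));
    TokenStream stream = new NGramTokenFilter(source, 1, 2);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // Expected: a(+1), ab(+0), b(+0), bc(+0), c(+0): one position for all grams.
      System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    stream.end();
    stream.close();
  }
}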
lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java

@@ -22,7 +22,10 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Simple tests to ensure the NGram filter factories are working.
@@ -76,6 +79,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
         new String[] { "te", "tes", "es", "est", "st" });
   }
 
+  /**
+   * Test NGramFilterFactory on tokens with payloads
+   */
+  public void testNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
+
   /**
    * Test EdgeNGramTokenizerFactory
    */
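For readers unfamiliar with the payload plumbing this test relies on: the
"DelimitedPayload" factory with encoder "float" splits each token at the |
delimiter and stores the numeric suffix as a four-byte float payload, which
PayloadHelper decodes again in the assertion. A minimal round trip
(illustration only):

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;

public class PayloadRoundTrip {
  public static void main(String[] args) {
    // Encode 0.1f the same way DelimitedPayloadTokenFilter's float encoder does...
    BytesRef payload = new BytesRef(PayloadHelper.encodeFloat(0.1f));
    // ...and decode it back, as the assertEquals in the test does.
    float value = PayloadHelper.decodeFloat(payload.bytes);
    System.out.println(value); // 0.1
  }
}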
@@ -123,6 +148,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
     assertTokenStreamContents(stream,
         new String[] { "t", "te" });
   }
 
+  /**
+   * Test EdgeNGramFilterFactory on tokens with payloads
+   */
+  public void testEdgeNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {