LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes

[merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]

Signed-off-by: Uwe Schindler <uschindler@apache.org>
Uwe Schindler committed on 2017-01-16 11:16:43 +01:00
commit c64a01158e
4 changed files with 63 additions and 27 deletions
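
The substance of the fix, visible in both filter diffs below: instead of calling clearAttributes() and hand-copying a fixed subset of attributes (offsets, position length), the filters now snapshot the entire AttributeSource with captureState() when they consume an input token and call restoreState() before emitting each gram, so payloads and any other attributes survive untouched. A minimal sketch of that idiom, assuming Lucene's TokenFilter API (the filter below is illustrative, not part of the patch):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.AttributeSource.State;

    // Hypothetical filter emitting all prefixes of each token, built on the
    // same captureState()/restoreState() idiom the patch introduces.
    public final class PrefixEmittingFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

      private State state;   // full snapshot of the current input token's attributes
      private char[] buffer; // term text of the token being decomposed
      private int length;
      private int prefixLen;

      public PrefixEmittingFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        while (true) {
          if (buffer == null) {
            if (!input.incrementToken()) {
              return false;
            }
            buffer = termAtt.buffer().clone();
            length = termAtt.length();
            prefixLen = 1;
            state = captureState(); // snapshot every attribute, payload included
          }
          if (prefixLen <= length) {
            restoreState(state); // bring all saved attributes back ...
            if (prefixLen > 1) {
              posIncAtt.setPositionIncrement(0); // only the first sub-token advances the position
            }
            termAtt.copyBuffer(buffer, 0, prefixLen); // ... then overwrite only the term
            prefixLen++;
            return true;
          }
          buffer = null; // done with this token, pull the next one
        }
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        buffer = null;
      }
    }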

lucene/CHANGES.txt

@@ -61,6 +61,13 @@ Other
 
 * LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
 
+======================= Lucene 6.5.0 =======================
+
+Bug Fixes
+
+* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
+  and preserve all attributes. (Nathan Gass via Uwe Schindler)
+
 ======================= Lucene 6.4.0 =======================
 
 API Changes

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java

@@ -22,9 +22,8 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the given token into n-grams of given size(s).
@@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int curTermLength;
   private int curCodePointCount;
   private int curGramSize;
-  private int tokStart;
-  private int tokEnd; // only used if the length changed before this filter
   private int savePosIncr;
-  private int savePosLen;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           curTermLength = termAtt.length();
           curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
           curGramSize = minGram;
-          tokStart = offsetAtt.startOffset();
-          tokEnd = offsetAtt.endOffset();
+          state = captureState();
           savePosIncr += posIncrAtt.getPositionIncrement();
-          savePosLen = posLenAtt.getPositionLength();
         }
       }
       if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
         if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
           // grab gramSize chars from front or back
-          clearAttributes();
-          offsetAtt.setOffset(tokStart, tokEnd);
+          restoreState(state);
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
             posIncrAtt.setPositionIncrement(savePosIncr);
@@ -104,7 +96,6 @@
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
-          posLenAtt.setPositionLength(savePosLen);
           final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
           termAtt.copyBuffer(curTermBuffer, 0, charLength);
           curGramSize++;

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the input into n-grams of the given size(s).
@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curCodePointCount;
   private int curGramSize;
   private int curPos;
-  private int curPosInc, curPosLen;
-  private int tokStart;
-  private int tokEnd;
+  private int curPosInc;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncAtt;
-  private final PositionLengthAttribute posLenAtt;
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates NGramTokenFilter with given min and max n-grams.
@@ -79,7 +75,6 @@
     this.minGram = minGram;
     this.maxGram = maxGram;
     posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    posLenAtt = addAttribute(PositionLengthAttribute.class);
   }
 
   /**
@@ -104,9 +99,7 @@
           curGramSize = minGram;
           curPos = 0;
           curPosInc = posIncAtt.getPositionIncrement();
-          curPosLen = posLenAtt.getPositionLength();
-          tokStart = offsetAtt.startOffset();
-          tokEnd = offsetAtt.endOffset();
+          state = captureState();
         }
       }
 
@@ -115,14 +108,12 @@
         curGramSize = minGram;
       }
       if ((curPos + curGramSize) <= curCodePointCount) {
-        clearAttributes();
+        restoreState(state);
         final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
         final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
         termAtt.copyBuffer(curTermBuffer, start, end - start);
         posIncAtt.setPositionIncrement(curPosInc);
         curPosInc = 0;
-        posLenAtt.setPositionLength(curPosLen);
-        offsetAtt.setOffset(tokStart, tokEnd);
         curGramSize++;
         return true;
       }

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java

@@ -22,7 +22,10 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Simple tests to ensure the NGram filter factories are working.
@@ -76,6 +79,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
         new String[] { "te", "tes", "es", "est", "st" });
   }
 
+  /**
+   * Test NGramFilterFactory on tokens with payloads
+   */
+  public void testNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
+
   /**
    * Test EdgeNGramTokenizerFactory
    */
@@ -123,6 +148,28 @@
     assertTokenStreamContents(stream,
         new String[] { "t", "te" });
   }
 
+  /**
+   * Test EdgeNGramFilterFactory on tokens with payloads
+   */
+  public void testEdgeNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
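
The new tests drive the fix through the factory layer; the same behavior can be checked directly against the filter constructors. A small standalone sketch, assuming Lucene 6.5+ analyzers-common on the classpath (the demo class itself is made up):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenFilter;
    import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
    import org.apache.lucene.analysis.payloads.FloatEncoder;
    import org.apache.lucene.analysis.payloads.PayloadHelper;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

    public class PayloadSurvivalDemo {
      public static void main(String[] args) throws IOException {
        // "test" carries a float payload of 0.1, attached by DelimitedPayloadTokenFilter.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("test|0.1"));
        TokenStream stream = new DelimitedPayloadTokenFilter(tokenizer, '|', new FloatEncoder());
        stream = new NGramTokenFilter(stream, 1, 2);

        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
          // Before LUCENE-7630, getPayload() returned null for every n-gram here.
          System.out.println(term + " -> " + PayloadHelper.decodeFloat(payload.getPayload().bytes));
        }
        stream.end();
        stream.close();
      }
    }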