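Also copy all attributes for n-gram token filters (capture and restore the full token state instead of copying offsets and position length individually)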

This commit is contained in:
Nathan Gass 2017-01-13 17:01:34 +01:00
parent 80e2854247
commit ea049b96a2
1 changed file with 5 additions and 14 deletions

View File

@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.util.AttributeSource.State;
/** /**
* Tokenizes the input into n-grams of the given size(s). * Tokenizes the input into n-grams of the given size(s).
@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
private int curCodePointCount; private int curCodePointCount;
private int curGramSize; private int curGramSize;
private int curPos; private int curPos;
private int curPosInc, curPosLen; private int curPosInc;
private int tokStart; private State state;
private int tokEnd;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt; private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/** /**
* Creates NGramTokenFilter with given min and max n-grams. * Creates NGramTokenFilter with given min and max n-grams.
@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
this.maxGram = maxGram; this.maxGram = maxGram;
posIncAtt = addAttribute(PositionIncrementAttribute.class); posIncAtt = addAttribute(PositionIncrementAttribute.class);
posLenAtt = addAttribute(PositionLengthAttribute.class);
} }
/** /**
@@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram; curGramSize = minGram;
curPos = 0; curPos = 0;
curPosInc = posIncAtt.getPositionIncrement(); curPosInc = posIncAtt.getPositionIncrement();
curPosLen = posLenAtt.getPositionLength(); state = captureState();
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
} }
} }
@@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram; curGramSize = minGram;
} }
if ((curPos + curGramSize) <= curCodePointCount) { if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes(); restoreState(state);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start); termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc); posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0; curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
offsetAtt.setOffset(tokStart, tokEnd);
curGramSize++; curGramSize++;
return true; return true;
} }