mirror of https://github.com/apache/lucene.git
LUCENE-3969: when outputting a bigram token, set its position length (posLen) to 2 to indicate that it spans two tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324727 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
64631a4309
commit
71291daa74
|
@ -16,6 +16,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -54,6 +55,7 @@ public final class CommonGramsFilter extends TokenFilter {
|
|||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
|
||||
|
||||
private int lastStartOffset;
|
||||
private boolean lastWasCommon;
|
||||
|
@ -166,6 +168,7 @@ public final class CommonGramsFilter extends TokenFilter {
|
|||
buffer.getChars(0, length, termText, 0);
|
||||
termAttribute.setLength(length);
|
||||
posIncAttribute.setPositionIncrement(0);
|
||||
posLenAttribute.setPositionLength(2); // bigram
|
||||
offsetAttribute.setOffset(lastStartOffset, endOffset);
|
||||
typeAttribute.setType(GRAM_TYPE);
|
||||
buffer.setLength(0);
|
||||
|
|
Loading…
Reference in New Issue