mirror of https://github.com/apache/lucene.git
LUCENE-7668: add new test case; remove dead code; improve CannedTokenStream to copy all Token attributes
This commit is contained in:
parent
a43ef8f480
commit
72eaeab715
|
@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
@ -173,7 +172,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
|||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
|
||||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
|
||||
|
||||
// used for iterating word delimiter breaks
|
||||
private final WordDelimiterIterator iterator;
|
||||
|
|
|
@ -155,6 +155,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
|||
doSplitPossessive(0, "ra's", "ra", "s");
|
||||
}
|
||||
|
||||
public void testTokenType() throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
// test that subwords and catenated subwords
|
||||
// inherit the type of the original token.
|
||||
Token token = new Token("foo-bar", 5, 12);
|
||||
token.setType("mytype");
|
||||
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] {"foobar", "foo", "bar"},
|
||||
new String[] {"mytype", "mytype", "mytype"});
|
||||
}
|
||||
|
||||
/*
|
||||
* Set a large position increment gap of 10 if the token is "largegap" or "/"
|
||||
*/
|
||||
|
@ -177,7 +190,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testPositionIncrements() throws Exception {
|
||||
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
|
||||
|
|
|
@ -18,11 +18,9 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
|
||||
/**
|
||||
* TokenStream from a canned list of Tokens.
|
||||
|
@ -30,23 +28,19 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
|||
public final class CannedTokenStream extends TokenStream {
|
||||
private final Token[] tokens;
|
||||
private int upto = 0;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final int finalOffset;
|
||||
private final int finalPosInc;
|
||||
|
||||
public CannedTokenStream(Token... tokens) {
|
||||
this.tokens = tokens;
|
||||
finalOffset = 0;
|
||||
finalPosInc = 0;
|
||||
this(0, 0, tokens);
|
||||
}
|
||||
|
||||
/** If you want trailing holes, pass a non-zero
|
||||
* finalPosInc. */
|
||||
public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
|
||||
super(Token.TOKEN_ATTRIBUTE_FACTORY);
|
||||
this.tokens = tokens;
|
||||
this.finalOffset = finalOffset;
|
||||
this.finalPosInc = finalPosInc;
|
||||
|
@ -62,16 +56,10 @@ public final class CannedTokenStream extends TokenStream {
|
|||
@Override
|
||||
public boolean incrementToken() {
|
||||
if (upto < tokens.length) {
|
||||
final Token token = tokens[upto++];
|
||||
// TODO: can we just capture/restoreState so
|
||||
// we get all attrs...?
|
||||
clearAttributes();
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(token.toString());
|
||||
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
|
||||
posLengthAtt.setPositionLength(token.getPositionLength());
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
payloadAtt.setPayload(token.getPayload());
|
||||
clearAttributes();
|
||||
// NOTE: this looks weird, casting offsetAtt to Token, but because we are using the Token class's AttributeFactory, all attributes are
|
||||
// in fact backed by the Token class, so we just copy the current token into our Token:
|
||||
tokens[upto++].copyTo((Token) offsetAtt);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
|
|
Loading…
Reference in New Issue