LUCENE-7668: add new test case; remove dead code; improve CannedTokenStream to copy all Token attributes

This commit is contained in:
Mike McCandless 2017-01-31 11:56:07 -05:00
parent a43ef8f480
commit 72eaeab715
3 changed files with 22 additions and 23 deletions

View File

@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
@ -173,7 +172,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
// used for iterating word delimiter breaks
private final WordDelimiterIterator iterator;

View File

@ -155,6 +155,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
doSplitPossessive(0, "ra's", "ra", "s");
}
public void testTokenType() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords inherit
// the type of the original token ("mytype" below); requires
// CannedTokenStream to propagate the TypeAttribute it was given.
Token token = new Token("foo-bar", 5, 12);
token.setType("mytype");
WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] {"foobar", "foo", "bar"},
new String[] {"mytype", "mytype", "mytype"});
}
/*
* Set a large position increment gap of 10 if the token is "largegap" or "/"
*/
@ -177,7 +190,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
}
}
}
public void testPositionIncrements() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);

View File

@ -18,11 +18,9 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/**
* TokenStream from a canned list of Tokens.
@ -30,23 +28,19 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
public final class CannedTokenStream extends TokenStream {
private final Token[] tokens;
private int upto = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final int finalOffset;
private final int finalPosInc;
public CannedTokenStream(Token... tokens) {
this.tokens = tokens;
finalOffset = 0;
finalPosInc = 0;
this(0, 0, tokens);
}
/** If you want trailing holes, pass a non-zero
* finalPosInc. */
public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
super(Token.TOKEN_ATTRIBUTE_FACTORY);
this.tokens = tokens;
this.finalOffset = finalOffset;
this.finalPosInc = finalPosInc;
@ -62,16 +56,10 @@ public final class CannedTokenStream extends TokenStream {
@Override
public boolean incrementToken() {
if (upto < tokens.length) {
final Token token = tokens[upto++];
// TODO: can we just capture/restoreState so
// we get all attrs...?
clearAttributes();
termAtt.setEmpty();
termAtt.append(token.toString());
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
posLengthAtt.setPositionLength(token.getPositionLength());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
payloadAtt.setPayload(token.getPayload());
clearAttributes();
// NOTE: this looks weird, casting offsetAtt to Token, but because we are using the Token class's AttributeFactory, all attributes are
// in fact backed by the Token class, so we just copy the current token into our Token:
tokens[upto++].copyTo((Token) offsetAtt);
return true;
} else {
return false;