allow MockTokenizer to take max token length; default to MAX_INT (= no change)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1125972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-05-22 14:18:55 +00:00
parent 886e17c842
commit 3ec7abd684
2 changed files with 14 additions and 7 deletions

View File

@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
@@ -53,6 +54,8 @@ public class MockTokenizer extends Tokenizer {
private final CharacterRunAutomaton runAutomaton;
private final boolean lowerCase;
private final int maxTokenLength;
public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
private int state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -74,20 +77,21 @@ public class MockTokenizer extends Tokenizer {
private State streamState = State.CLOSE;
private boolean enableChecks = true;
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
super(factory, input);
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.state = runAutomaton.getInitialState();
this.streamState = State.SETREADER;
this.maxTokenLength = maxTokenLength;
}
public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength);
}
public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
super(input);
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.state = runAutomaton.getInitialState();
this.streamState = State.SETREADER;
this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
}
@Override
@@ -107,6 +111,9 @@ public class MockTokenizer extends Tokenizer {
for (int i = 0; i < chars.length; i++)
termAtt.append(chars[i]);
endOffset = off;
if (termAtt.length() >= maxTokenLength) {
break;
}
cp = readCodePoint();
} while (cp >= 0 && isTokenChar(cp));
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));

View File

@@ -225,7 +225,7 @@ public class TestToken extends LuceneTestCase {
}
public void testTokenAttributeFactory() throws Exception {
TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, new StringReader("foo bar"), MockTokenizer.WHITESPACE, false);
TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, new StringReader("foo bar"), MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
assertTrue("SenselessAttribute is not implemented by SenselessAttributeImpl",
ts.addAttribute(SenselessAttribute.class) instanceof SenselessAttributeImpl);