mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 13:29:26 +00:00
Allow MockTokenizer to take a maximum token length; it defaults to Integer.MAX_VALUE (= no change in behavior)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1125972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
886e17c842
commit
3ec7abd684
@ -22,6 +22,7 @@ import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
@ -53,6 +54,8 @@ public class MockTokenizer extends Tokenizer {
|
||||
|
||||
private final CharacterRunAutomaton runAutomaton;
|
||||
private final boolean lowerCase;
|
||||
private final int maxTokenLength;
|
||||
public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
|
||||
private int state;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
@ -74,20 +77,21 @@ public class MockTokenizer extends Tokenizer {
|
||||
private State streamState = State.CLOSE;
|
||||
private boolean enableChecks = true;
|
||||
|
||||
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
|
||||
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
||||
super(factory, input);
|
||||
this.runAutomaton = runAutomaton;
|
||||
this.lowerCase = lowerCase;
|
||||
this.state = runAutomaton.getInitialState();
|
||||
this.streamState = State.SETREADER;
|
||||
this.maxTokenLength = maxTokenLength;
|
||||
}
|
||||
|
||||
public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
||||
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength);
|
||||
}
|
||||
|
||||
public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
|
||||
super(input);
|
||||
this.runAutomaton = runAutomaton;
|
||||
this.lowerCase = lowerCase;
|
||||
this.state = runAutomaton.getInitialState();
|
||||
this.streamState = State.SETREADER;
|
||||
this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -107,6 +111,9 @@ public class MockTokenizer extends Tokenizer {
|
||||
for (int i = 0; i < chars.length; i++)
|
||||
termAtt.append(chars[i]);
|
||||
endOffset = off;
|
||||
if (termAtt.length() >= maxTokenLength) {
|
||||
break;
|
||||
}
|
||||
cp = readCodePoint();
|
||||
} while (cp >= 0 && isTokenChar(cp));
|
||||
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
|
||||
|
@ -225,7 +225,7 @@ public class TestToken extends LuceneTestCase {
|
||||
}
|
||||
|
||||
public void testTokenAttributeFactory() throws Exception {
|
||||
TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, new StringReader("foo bar"), MockTokenizer.WHITESPACE, false);
|
||||
TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, new StringReader("foo bar"), MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
|
||||
|
||||
assertTrue("SenselessAttribute is not implemented by SenselessAttributeImpl",
|
||||
ts.addAttribute(SenselessAttribute.class) instanceof SenselessAttributeImpl);
|
||||
|
Loading…
x
Reference in New Issue
Block a user