mirror of https://github.com/apache/lucene.git

LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer

parent 5ebd41d13f
commit 9ed722f565

@@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length.  If a token is seen
-   * that exceeds this length then it is discarded.  This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
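
A quick sketch of the behavior the corrected javadoc describes (illustrative only, not part of this commit; the class name and input string are my choices): an oversized token is chopped at maxTokenLength and emitted in pieces, rather than silently discarded as the old wording claimed.

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class ChoppedTokenDemo {
      public static void main(String[] args) throws Exception {
        UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
        a.setMaxTokenLength(5);
        // "toolong" (7 chars) is not dropped; it comes out as "toolo" + "ng",
        // matching the testMaxTokenLengthNonDefault expectation further down.
        try (TokenStream ts = a.tokenStream("f", "toolong")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println(term);
          }
          ts.end();
        }
        a.close();
      }
    }
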
@@ -72,19 +72,34 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     "<URL>",
     "<EMAIL>",
   };
 
+  /** Absolute maximum sized token */
+  public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+
   private int skippedPositions;
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
-  /** Set the max allowed token length.  Any token longer
-   *  than this is skipped. */
+  /**
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   *
+   * @throws IllegalArgumentException if the given length is outside of the
+   *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+   */
   public void setMaxTokenLength(int length) {
-    if (length != maxTokenLength) {
-      this.maxTokenLength = length;
-      scanner.setBufferSize(length);
-    }
+    if (length < 1) {
+      throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+    } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+      throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+    }
+    this.maxTokenLength = length;
+    scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
   }
 
   /** @see #setMaxTokenLength */
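
For illustration (assumed usage, not from the patch): with the new bounds checking, setMaxTokenLength fails fast outside [1, MAX_TOKEN_LENGTH_LIMIT], where it previously accepted any int and resized the scanner buffer to it uncapped.

    import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

    public class MaxLengthBoundsDemo {
      public static void main(String[] args) throws Exception {
        UAX29URLEmailTokenizer tok = new UAX29URLEmailTokenizer();
        // The inclusive upper bound (1024 * 1024) is still accepted:
        tok.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);
        try {
          tok.setMaxTokenLength(Integer.MAX_VALUE); // accepted before this commit
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage()); // "maxTokenLength may not exceed 1048576"
        }
        tok.close();
      }
    }
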
@@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }
@@ -105,7 +105,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
-      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
+      tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);  // Tokenize arbitrary length URLs
       TokenFilter filter = new URLFilter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }
@@ -81,10 +81,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length.  If a token is seen
-   * that exceeds this length then it is discarded.  This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -107,6 +108,8 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
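
A hedged sketch of why that setReader() hook matters (my reading of Analyzer's per-thread component reuse; the demo class and field name "f" are illustrative, not from the commit): without it, the cached tokenizer would keep its old maxTokenLength across calls.

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class DeferredMaxLengthDemo {
      // drain a stream and print each term
      static void printTokens(TokenStream ts) throws Exception {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term);
        }
        ts.end();
      }

      public static void main(String[] args) throws Exception {
        StandardAnalyzer a = new StandardAnalyzer();
        try (TokenStream ts = a.tokenStream("f", "toolong")) {
          printTokens(ts); // default max (255 chars): "toolong" is emitted whole
        }
        a.setMaxTokenLength(5);
        // The reused tokenizer picks up the new max via setReader() on this call:
        try (TokenStream ts = a.tokenStream("f", "toolong")) {
          printTokens(ts); // now "toolo", "ng"
        }
        a.close();
      }
    }
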
@@ -105,7 +105,11 @@ public final class StandardTokenizer extends Tokenizer {
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
   /**
-   * Set the max allowed token length.  No tokens longer than this are emitted.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   *
+   * @throws IllegalArgumentException if the given length is outside of the
+   *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
    */
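
The javadoc's suggested recipe for skipping (rather than chopping) huge tokens, sketched as a hypothetical custom analyzer (SkipLongTokensAnalyzer and the 255-char cutoff are my choices, not from the patch):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.LengthFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;

    public class SkipLongTokensAnalyzer extends Analyzer {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        StandardTokenizer src = new StandardTokenizer();
        // Raise the tokenizer cap so huge tokens survive intact...
        src.setMaxTokenLength(StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT);
        // ...then drop anything longer than 255 chars instead of indexing the pieces.
        TokenStream tok = new LengthFilter(src, 1, 255);
        return new TokenStreamComponents(src, tok);
      }
    }
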
@@ -393,4 +393,27 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     Analyzer a = new StandardAnalyzer();
     assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }