LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer

Mike McCandless 2017-04-11 15:37:42 -04:00
parent 5ebd41d13f
commit 9ed722f565
7 changed files with 85 additions and 14 deletions

UAX29URLEmailAnalyzer.java

@@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length.  If a token is seen
-   * that exceeds this length then it is discarded.  This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
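
Not part of the commit, but a minimal sketch of the alternative the new javadoc suggests: raise the tokenizer's cap so long tokens survive intact, then let LengthFilter drop them outright. It assumes Lucene's org.apache.lucene.analysis.miscellaneous.LengthFilter; the class name and the 255-char cutoff are illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.LengthFilter;
    import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

    /** Hypothetical analyzer, for illustration only. */
    public final class SkipLongTokensAnalyzer extends Analyzer {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer();
        // Raise the cap so over-long tokens are not chopped into pieces...
        src.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);
        // ...then remove any token longer than 255 chars entirely:
        TokenStream tok = new LengthFilter(src, 1, 255);
        return new TokenStreamComponents(src, tok);
      }
    }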

UAX29URLEmailTokenizer.java

@@ -72,19 +72,34 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     "<URL>",
     "<EMAIL>",
   };
 
+  /** Absolute maximum sized token */
+  public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+
   private int skippedPositions;
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
-  /** Set the max allowed token length.  Any token longer
-   *  than this is skipped. */
+  /**
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   *
+   * @throws IllegalArgumentException if the given length is outside of the
+   *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+   */
   public void setMaxTokenLength(int length) {
-    if (length != maxTokenLength) {
-      this.maxTokenLength = length;
-      scanner.setBufferSize(length);
-    }
+    if (length < 1) {
+      throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+    } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+      throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+    }
+    this.maxTokenLength = length;
+    scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
   }
 
   /** @see #setMaxTokenLength */
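
A short sketch, not from the commit, of what the new bounds check means for callers; expectThrows is assumed to be LuceneTestCase's helper, and the values are illustrative:

    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer();
    tokenizer.setMaxTokenLength(255); // fine: within [1, 1024 * 1024]
    tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT); // fine: largest accepted value

    // Out-of-range values now fail fast instead of being silently adjusted:
    expectThrows(IllegalArgumentException.class, () -> tokenizer.setMaxTokenLength(0));
    expectThrows(IllegalArgumentException.class,
        () -> tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT + 1));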

TestUAX29URLEmailAnalyzer.java

@@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }

TestUAX29URLEmailTokenizer.java

@@ -105,7 +105,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
-      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
+      tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);  // Tokenize arbitrary length URLs
       TokenFilter filter = new URLFilter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }

StandardAnalyzer.java

@@ -81,10 +81,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length.  If a token is seen
-   * that exceeds this length then it is discarded.  This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -107,6 +108,8 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
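
This setReader hook is what makes a later setMaxTokenLength call take effect: the analyzer pushes its current value onto the reused tokenizer whenever a new reader is set. A minimal sketch of the resulting behavior (illustrative, not from this commit), assuming Lucene's core analysis API:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    static void demo() throws IOException {
      StandardAnalyzer analyzer = new StandardAnalyzer();
      analyzer.setMaxTokenLength(5); // applied on the next tokenStream call
      try (TokenStream ts = analyzer.tokenStream("field", "abcdefgh")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // prints "abcde", then "fgh"
        }
        ts.end();
      }
      analyzer.close();
    }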

StandardTokenizer.java

@@ -105,7 +105,11 @@ public final class StandardTokenizer extends Tokenizer {
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
   /**
-   * Set the max allowed token length.  No tokens longer than this are emitted.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    *
    * @throws IllegalArgumentException if the given length is outside of the
    *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
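
At the tokenizer level the limit applies as soon as the next token is scanned. A sketch of driving StandardTokenizer by hand (illustrative, not from this commit; it mirrors the chopping that the TestStandardAnalyzer changes below verify):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    static void demo() throws IOException {
      StandardTokenizer tokenizer = new StandardTokenizer();
      tokenizer.setMaxTokenLength(5); // must lie in [1, MAX_TOKEN_LENGTH_LIMIT]
      tokenizer.setReader(new StringReader("toolong x"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term.toString()); // "toolo", "ng", "x"
      }
      tokenizer.end();
      tokenizer.close();
    }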

TestStandardAnalyzer.java

@@ -393,4 +393,27 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     Analyzer a = new StandardAnalyzer();
     assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }