mirror of https://github.com/apache/lucene.git
LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer
parent 5ebd41d13f
commit 9ed722f565
UAX29URLEmailAnalyzer.java

@@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length. If a token is seen
-   * that exceeds this length then it is discarded. This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length. Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens. If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens. The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
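The rewritten javadoc changes the documented contract from "discard" to "chop up". As a sketch of the behavior it now describes (the demo class, field name "f", and sample text are illustrative only, not from the commit; the expected tokens match the tests added further down):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ChoppingDemo {
  public static void main(String[] args) throws IOException {
    UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
    a.setMaxTokenLength(5);
    // "toolong" exceeds the 5-char cap, so it is emitted as "toolo" + "ng";
    // the old javadoc's wording suggested it would be discarded entirely.
    try (TokenStream ts = a.tokenStream("f", "ab toolong z")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // ab, toolo, ng, z
      }
      ts.end();
    }
    a.close();
  }
}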
UAX29URLEmailTokenizer.java

@@ -73,18 +73,33 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     "<EMAIL>",
   };
 
+  /** Absolute maximum sized token */
+  public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+
   private int skippedPositions;
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
-  /** Set the max allowed token length. Any token longer
-   *  than this is skipped. */
+  /**
+   * Set the max allowed token length. Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens. If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens. The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   *
+   * @throws IllegalArgumentException if the given length is outside of the
+   *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+   */
   public void setMaxTokenLength(int length) {
     if (length < 1) {
       throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+    } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+      throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
     }
-    this.maxTokenLength = length;
-    scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+    if (length != maxTokenLength) {
+      this.maxTokenLength = length;
+      scanner.setBufferSize(length);
+    }
   }
 
   /** @see #setMaxTokenLength */
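A hedged sketch of the caller-visible contract added by the new range check (the demo class is invented; the exception messages are copied from the diff above; 1024 * 1024 = 1048576):

import java.io.IOException;

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

public class RangeCheckDemo {
  public static void main(String[] args) throws IOException {
    UAX29URLEmailTokenizer t = new UAX29URLEmailTokenizer();
    t.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT); // ok: the bound is inclusive
    try {
      t.setMaxTokenLength(0); // below the legal range [1, MAX_TOKEN_LENGTH_LIMIT]
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage()); // maxTokenLength must be greater than zero
    }
    try {
      t.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT + 1); // above the cap
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage()); // maxTokenLength may not exceed 1048576
    }
    t.close();
  }
}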
TestUAX29URLEmailAnalyzer.java

@@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
+
 }
TestUAX29URLEmailTokenizer.java

@@ -105,7 +105,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
-      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
+      tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);  // Tokenize arbitrary length URLs
      TokenFilter filter = new URLFilter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }
StandardAnalyzer.java

@@ -81,10 +81,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length. If a token is seen
-   * that exceeds this length then it is discarded. This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length. Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens. If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens. The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -107,6 +108,8 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
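The two added comment lines codify a subtle point: the cached TokenStreamComponents re-read the analyzer's maxTokenLength in setReader, so a change made between tokenStream calls is honored even though the components are reused. A small sketch of that contract, assuming the Lucene 6.x/7.x API of this commit (class and field names invented):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NextCallDemo {
  static void printTokens(TokenStream ts) throws IOException {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " ");
    }
    ts.end();
    ts.close();
    System.out.println();
  }

  public static void main(String[] args) throws IOException {
    StandardAnalyzer a = new StandardAnalyzer();
    printTokens(a.tokenStream("f", "abcdefgh")); // abcdefgh (default max is 255)
    a.setMaxTokenLength(4);
    // The reused components pick the new value up in setReader, so the very
    // next tokenStream call already chops at 4 chars:
    printTokens(a.tokenStream("f", "abcdefgh")); // abcd efgh
    a.close();
  }
}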
StandardTokenizer.java

@@ -105,7 +105,11 @@ public final class StandardTokenizer extends Tokenizer {
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
   /**
-   * Set the max allowed token length. No tokens longer than this are emitted.
+   * Set the max allowed token length. Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens. If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens. The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    *
    * @throws IllegalArgumentException if the given length is outside of the
    *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
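The recipe the javadoc suggests — raise the tokenizer's cap, then discard the oversized tokens — might look like this with LengthFilter (a sketch; the 20-char bound and demo class are arbitrary choices, not from the commit):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SkipLongTokensDemo {
  public static void main(String[] args) throws IOException {
    StringBuilder big = new StringBuilder();
    for (int i = 0; i < 300; i++) {
      big.append('x');
    }

    StandardTokenizer tok = new StandardTokenizer();
    // Raise the cap so the 300-char run survives tokenization in one piece...
    tok.setMaxTokenLength(StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT);
    tok.setReader(new StringReader("short " + big + " words"));
    // ...then drop everything longer than 20 chars instead of chopping it:
    try (TokenStream ts = new LengthFilter(tok, 1, 20)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // short, words
      }
      ts.end();
    }
  }
}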
TestStandardAnalyzer.java

@@ -393,4 +393,27 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     Analyzer a = new StandardAnalyzer();
     assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }