From 9ed722f5655639dd572853df5a5a14130323cf0f Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Tue, 11 Apr 2017 15:37:42 -0400 Subject: [PATCH] LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer --- .../standard/UAX29URLEmailAnalyzer.java | 11 +++++---- .../standard/UAX29URLEmailTokenizer.java | 23 +++++++++++++++---- .../standard/TestUAX29URLEmailAnalyzer.java | 23 +++++++++++++++++++ .../standard/TestUAX29URLEmailTokenizer.java | 2 +- .../analysis/standard/StandardAnalyzer.java | 11 +++++---- .../analysis/standard/StandardTokenizer.java | 6 ++++- .../standard/TestStandardAnalyzer.java | 23 +++++++++++++++++++ 7 files changed, 85 insertions(+), 14 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java index fe71b7e83f1..282c2e7ea97 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java @@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase { } /** - * Set maximum allowed token length. If a token is seen - * that exceeds this length then it is discarded. This - * setting only takes effect the next time tokenStream or - * tokenStream is called. + * Set the max allowed token length. Tokens larger than this will be chopped + * up at this token length and emitted as multiple tokens. If you need to + * skip such large tokens, you could increase this max length, and then + * use {@code LengthFilter} to remove long tokens. The default is + * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. */ public void setMaxTokenLength(int length) { maxTokenLength = length; @@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase { return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) { + // So that if maxTokenLength was changed, the change takes + // effect next time tokenStream is called: src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength); super.setReader(reader); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java index d2b02e43784..842ae510fc2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java @@ -72,19 +72,34 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { "", "", }; + + /** Absolute maximum sized token */ + public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024; private int skippedPositions; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; - /** Set the max allowed token length. Any token longer - * than this is skipped. */ + /** + * Set the max allowed token length. Tokens larger than this will be chopped + * up at this token length and emitted as multiple tokens. If you need to + * skip such large tokens, you could increase this max length, and then + * use {@code LengthFilter} to remove long tokens. The default is + * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. + * + * @throws IllegalArgumentException if the given length is outside of the + * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}]. + */ public void setMaxTokenLength(int length) { if (length < 1) { throw new IllegalArgumentException("maxTokenLength must be greater than zero"); + } else if (length > MAX_TOKEN_LENGTH_LIMIT) { + throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT); + } + if (length != maxTokenLength) { + this.maxTokenLength = length; + scanner.setBufferSize(length); } - this.maxTokenLength = length; - scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars } /** @see #setMaxTokenLength */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java index 14a5165e6bb..b9321784dd0 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java @@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); } + + public void testMaxTokenLengthDefault() throws Exception { + + StringBuilder bToken = new StringBuilder(); + // exact max length: + for(int i=0;i