LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer

Mike McCandless 2017-04-11 15:37:42 -04:00
parent 5ebd41d13f
commit 9ed722f565
7 changed files with 85 additions and 14 deletions

UAX29URLEmailAnalyzer.java

@@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length.  If a token is seen
-   * that exceeds this length then it is discarded.  This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
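
Not part of the commit, but a minimal sketch of the alternative the new javadoc suggests: raise the tokenizer's cap so long tokens survive intact, then let LengthFilter drop them outright. It assumes Lucene's org.apache.lucene.analysis.miscellaneous.LengthFilter; the class name and the 255-char cutoff are illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.LengthFilter;
    import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

    /** Hypothetical analyzer, for illustration only. */
    public final class SkipLongTokensAnalyzer extends Analyzer {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer();
        // Raise the cap so over-long tokens are not chopped into pieces...
        src.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);
        // ...then remove any token longer than 255 chars entirely:
        TokenStream tok = new LengthFilter(src, 1, 255);
        return new TokenStreamComponents(src, tok);
      }
    }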

UAX29URLEmailTokenizer.java

@@ -72,19 +72,34 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     "<URL>",
     "<EMAIL>",
   };
 
+  /** Absolute maximum sized token */
+  public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+
   private int skippedPositions;
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
-  /** Set the max allowed token length.  Any token longer
-   *  than this is skipped. */
+  /**
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   *
+   * @throws IllegalArgumentException if the given length is outside of the
+   *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+   */
   public void setMaxTokenLength(int length) {
-    if (length != maxTokenLength) {
-      this.maxTokenLength = length;
-      scanner.setBufferSize(length);
-    }
+    if (length < 1) {
+      throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+    } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+      throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+    }
+    this.maxTokenLength = length;
+    scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
   }
 
   /** @see #setMaxTokenLength */
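
A short sketch, not from the commit, of what the new bounds check means for callers; expectThrows is assumed to be LuceneTestCase's helper, and the values are illustrative:

    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer();
    tokenizer.setMaxTokenLength(255); // fine: within [1, 1024 * 1024]
    tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT); // fine: largest accepted value

    // Out-of-range values now fail fast instead of being silently adjusted:
    expectThrows(IllegalArgumentException.class, () -> tokenizer.setMaxTokenLength(0));
    expectThrows(IllegalArgumentException.class,
        () -> tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT + 1));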

TestUAX29URLEmailAnalyzer.java

@@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }

TestUAX29URLEmailTokenizer.java

@@ -105,7 +105,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
       UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
-      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
+      tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);  // Tokenize arbitrary length URLs
       TokenFilter filter = new URLFilter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }

StandardAnalyzer.java

@@ -81,10 +81,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Set maximum allowed token length.  If a token is seen
-   * that exceeds this length then it is discarded.  This
-   * setting only takes effect the next time tokenStream or
-   * tokenStream is called.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     maxTokenLength = length;
@@ -107,6 +108,8 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) {
+        // So that if maxTokenLength was changed, the change takes
+        // effect next time tokenStream is called:
         src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
         super.setReader(reader);
       }
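
This setReader hook is what makes a later setMaxTokenLength call take effect: the analyzer pushes its current value onto the reused tokenizer whenever a new reader is set. A minimal sketch of the resulting behavior (illustrative, not from this commit), assuming Lucene's core analysis API:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    static void demo() throws IOException {
      StandardAnalyzer analyzer = new StandardAnalyzer();
      analyzer.setMaxTokenLength(5); // applied on the next tokenStream call
      try (TokenStream ts = analyzer.tokenStream("field", "abcdefgh")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // prints "abcde", then "fgh"
        }
        ts.end();
      }
      analyzer.close();
    }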

StandardTokenizer.java

@@ -105,7 +105,11 @@ public final class StandardTokenizer extends Tokenizer {
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
   /**
-   * Set the max allowed token length.  No tokens longer than this are emitted.
+   * Set the max allowed token length.  Tokens larger than this will be chopped
+   * up at this token length and emitted as multiple tokens.  If you need to
+   * skip such large tokens, you could increase this max length, and then
+   * use {@code LengthFilter} to remove long tokens.  The default is
+   * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    *
    * @throws IllegalArgumentException if the given length is outside of the
    *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
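
At the tokenizer level the limit applies as soon as the next token is scanned. A sketch of driving StandardTokenizer by hand (illustrative, not from this commit; it mirrors the chopping that the TestStandardAnalyzer changes below verify):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    static void demo() throws IOException {
      StandardTokenizer tokenizer = new StandardTokenizer();
      tokenizer.setMaxTokenLength(5); // must lie in [1, MAX_TOKEN_LENGTH_LIMIT]
      tokenizer.setReader(new StringReader("toolong x"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term.toString()); // "toolo", "ng", "x"
      }
      tokenizer.end();
      tokenizer.close();
    }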

TestStandardAnalyzer.java

@@ -393,4 +393,27 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     Analyzer a = new StandardAnalyzer();
     assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
   }
+
+  public void testMaxTokenLengthDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+
+    StringBuilder bToken = new StringBuilder();
+    // exact max length:
+    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+      bToken.append('b');
+    }
+
+    String bString = bToken.toString();
+    // first bString is exact max default length; next one is 1 too long
+    String input = "x " + bString + " " + bString + "b";
+    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    a.close();
+  }
+
+  public void testMaxTokenLengthNonDefault() throws Exception {
+    StandardAnalyzer a = new StandardAnalyzer();
+    a.setMaxTokenLength(5);
+    assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+    a.close();
+  }
 }