LUCENE-9153: Allow WhitespaceAnalyzer to set a custom maxTokenLen (#1198)

WhitespaceTokenizer defaults to a maximum token length of 255, and WhitespaceAnalyzer
does not allow this to be changed. This commit adds an optional maxTokenLen parameter
to WhitespaceAnalyzer as well, and documents the existing token length restriction.
This commit is contained in:
Alan Woodward 2020-01-27 09:22:25 +00:00 committed by GitHub
parent 9ddd05cd14
commit 02f862670e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 77 additions and 2 deletions

View File

@@ -127,6 +127,9 @@ Improvements
first match in the passage. Also the sizing point now pivots at the center of the first match term and not its left
edge. This yields Passages that won't be identical to the previous behavior. (Nándor Mátravölgyi, David Smiley)
* LUCENE-9153: Allow WhitespaceAnalyzer to set a maxTokenLength other than the default of 255
(Alan Woodward)
Optimizations
---------------------

View File

@@ -25,15 +25,26 @@ import org.apache.lucene.analysis.Analyzer;
* @since 3.1
**/
public final class WhitespaceAnalyzer extends Analyzer {

// Maximum length (in chars) of tokens this analyzer will emit; runs of
// non-whitespace longer than this are split by the underlying tokenizer.
private final int maxTokenLength;

/**
 * Creates a new {@link WhitespaceAnalyzer} with a maximum token length of 255 chars
 */
public WhitespaceAnalyzer() {
this(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN);
}

/**
 * Creates a new {@link WhitespaceAnalyzer} with a custom maximum token length
 *
 * @param maxTokenLength the maximum token length the analyzer will emit
 */
public WhitespaceAnalyzer(int maxTokenLength) {
this.maxTokenLength = maxTokenLength;
}

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
// The configured length is forwarded to the tokenizer, which enforces it.
return new TokenStreamComponents(new WhitespaceTokenizer(maxTokenLength));
}
}

View File

@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;
@@ -47,6 +48,17 @@ public final class WhitespaceTokenizer extends CharTokenizer {
super(factory);
}
/**
 * Construct a new WhitespaceTokenizer using a given max token length and the
 * default attribute factory.
 *
 * @param maxTokenLen maximum token length the tokenizer will emit; longer runs of
 *                    non-whitespace characters are split at this length.
 *                    Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
 * @throws IllegalArgumentException if maxTokenLen is invalid.
 */
public WhitespaceTokenizer(int maxTokenLen) {
// Range validation of maxTokenLen (per the @throws above) happens in the
// CharTokenizer super constructor — presumably; confirm against CharTokenizer.
super(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, maxTokenLen);
}
/**
* Construct a new WhitespaceTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.

View File

@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestWhitespaceAnalyzer extends BaseTokenStreamTestCase {

// One period of the repeating token content.
private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";

// A 312-char (12 x 26) run with no whitespace, longer than the default
// maximum token length of 255 chars.
private static final String LONGTOKEN = buildLongToken();

private static String buildLongToken() {
StringBuilder sb = new StringBuilder(12 * ALPHABET.length());
for (int i = 0; i < 12; i++) {
sb.append(ALPHABET);
}
return sb.toString();
}

public void testDefaultMaximumTokenLength() throws IOException {
// With the default limit of 255 the long run is split at exactly 255 chars.
String head = LONGTOKEN.substring(0, 255);
String tail = LONGTOKEN.substring(255);
try (Analyzer analyzer = new WhitespaceAnalyzer()) {
assertAnalyzesTo(analyzer, LONGTOKEN + " extra", new String[] { head, tail, "extra" });
}
}

public void testCustomMaximumTokenLength() throws IOException {
// 312 < 1024, so the long run survives as a single token.
try (Analyzer analyzer = new WhitespaceAnalyzer(1024)) {
assertAnalyzesTo(analyzer, LONGTOKEN + " extra", new String[] { LONGTOKEN, "extra" });
}
}
}