From 02f862670eee5a117824d61168a1f61cf145bc5f Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 27 Jan 2020 09:22:25 +0000 Subject: [PATCH] LUCENE-9153: Allow WhitespaceAnalyzer to set a custom maxTokenLen (#1198) WhitespaceTokenizer defaults to a maximum token length of 255, and WhitespaceAnalyzer does not allow this to be changed. This commit adds an optional maxTokenLen parameter to WhitespaceAnalyzer as well, and documents the existing token length restriction. --- lucene/CHANGES.txt | 3 ++ .../analysis/core/WhitespaceAnalyzer.java | 15 +++++- .../analysis/core/WhitespaceTokenizer.java | 12 +++++ .../analysis/core/TestWhitespaceAnalyzer.java | 49 +++++++++++++++++++ 4 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestWhitespaceAnalyzer.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 37f858a6711..95d8b574f6b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -127,6 +127,9 @@ Improvements first match in the passage. Also the sizing point now pivots at the center of the first match term and not its left edge. This yields Passages that won't be identical to the previous behavior. (Nándor Mátravölgyi, David Smiley) +* LUCENE-9153: Allow WhitespaceAnalyzer to set a maxTokenLength other than the default of 255 + (Alan Woodward) + Optimizations --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java index 14c3219abdf..d8ba627e837 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java @@ -25,15 +25,26 @@ import org.apache.lucene.analysis.Analyzer; * @since 3.1 **/ public final class WhitespaceAnalyzer extends Analyzer { + + private final int maxTokenLength; /** - * Creates a new {@link WhitespaceAnalyzer} + * Creates a new {@link WhitespaceAnalyzer} with a maximum token length of 255 chars */ public WhitespaceAnalyzer() { + this(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN); + } + + /** + * Creates a new {@link WhitespaceAnalyzer} with a custom maximum token length + * @param maxTokenLength the maximum token length the analyzer will emit + */ + public WhitespaceAnalyzer(int maxTokenLength) { + this.maxTokenLength = maxTokenLength; } @Override protected TokenStreamComponents createComponents(final String fieldName) { - return new TokenStreamComponents(new WhitespaceTokenizer()); + return new TokenStreamComponents(new WhitespaceTokenizer(maxTokenLength)); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java index 065522761d0..f0fa7eaf508 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.core; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeFactory; @@ -47,6 +48,17 @@ public final class WhitespaceTokenizer extends CharTokenizer { super(factory); } + /** + * Construct a new WhitespaceTokenizer using a given max token length + * + * @param maxTokenLen maximum token length the tokenizer will emit. + * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024) + * @throws IllegalArgumentException if maxTokenLen is invalid. + */ + public WhitespaceTokenizer(int maxTokenLen) { + super(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, maxTokenLen); + } + /** * Construct a new WhitespaceTokenizer using a given * {@link org.apache.lucene.util.AttributeFactory}. diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestWhitespaceAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestWhitespaceAnalyzer.java new file mode 100644 index 00000000000..d4cae2e150c --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestWhitespaceAnalyzer.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.core; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; + +public class TestWhitespaceAnalyzer extends BaseTokenStreamTestCase { + + private static final String LONGTOKEN = + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" + + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" + + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + + public void testDefaultMaximumTokenLength() throws IOException { + try (Analyzer a = new WhitespaceAnalyzer()) { + assertAnalyzesTo(a, LONGTOKEN + " extra", new String[]{ + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" + + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" + + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstu", + "vwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz", "extra" + }); + } + } + + public void testCustomMaximumTokenLength() throws IOException { + try (Analyzer a = new WhitespaceAnalyzer(1024)) { + assertAnalyzesTo(a, LONGTOKEN + " extra", new String[] { LONGTOKEN, "extra" }); + } + } + +}