From 19a336e8d33c9360e32af80118e75619a8ee0d30 Mon Sep 17 00:00:00 2001
From: Andrei Balici
Date: Wed, 20 May 2020 13:15:57 +0100
Subject: [PATCH] Add `max_token_length` setting to the CharGroupTokenizer (#56860)

Adds `max_token_length` option to the CharGroupTokenizer.
Updates documentation as well to reflect the changes.

Closes #56676
---
 .../tokenizers/chargroup-tokenizer.asciidoc   |  6 ++-
 .../common/CharGroupTokenizerFactory.java     |  8 ++-
 .../CharGroupTokenizerFactoryTests.java       | 52 ++++++++++++++++++-
 3 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
index 443ed3c2962..92329fab543 100644
--- a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -13,11 +13,15 @@ The `char_group` tokenizer accepts one parameter:
 
 [horizontal]
 `tokenize_on_chars`::
-    A list containing a list of characters to tokenize the string on. Whenever a character 
+    A list containing a list of characters to tokenize the string on. Whenever a character
     from this list is encountered, a new token is started. This accepts either single
     characters like e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
     `punctuation`, `symbol`.
 
+`max_token_length`::
+    The maximum token length. If a token is seen that exceeds this length then
+    it is split at `max_token_length` intervals. Defaults to `255`.
+
 [float]
 === Example output
 
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
index 35770be0b01..a31609fc286 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
@@ -21,6 +21,7 @@ package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.util.AttributeFactory;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
@@ -31,7 +32,10 @@ import java.util.Set;
 
 public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
 
+    static final String MAX_TOKEN_LENGTH = "max_token_length";
+
     private final Set<Integer> tokenizeOnChars = new HashSet<>();
+    private final Integer maxTokenLength;
     private boolean tokenizeOnSpace = false;
     private boolean tokenizeOnLetter = false;
     private boolean tokenizeOnDigit = false;
@@ -41,6 +45,8 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, settings, name);
 
+        maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, CharTokenizer.DEFAULT_MAX_WORD_LEN);
+
         for (final String c : settings.getAsList("tokenize_on_chars")) {
             if (c == null || c.length() == 0) {
                 throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
@@ -110,7 +116,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
 
     @Override
     public Tokenizer create() {
-        return new CharTokenizer() {
+        return new CharTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, maxTokenLength) {
             @Override
             protected boolean isTokenChar(int c) {
                 if (tokenizeOnSpace && Character.isWhitespace(c)) {
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
index 1447531aa87..43cdf274956 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
@@ -19,7 +19,9 @@
 
 package org.elasticsearch.analysis.common;
 
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
@@ -27,11 +29,12 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
-
 public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
+
     public void testParseTokenChars() {
         final Index index = new Index("test", "_na_");
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
@@ -61,6 +64,53 @@ public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
         }
     }
 
+    public void testMaxTokenLength() throws IOException {
+        final Index index = new Index("test", "_na_");
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+        final String name = "cg";
+
+        String[] conf = new String[] {"-"};
+
+        final Settings defaultLengthSettings = newAnalysisSettingsBuilder()
+            .putList("tokenize_on_chars", conf)
+            .build();
+        CharTokenizer tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, defaultLengthSettings)
+            .create();
+        String textWithVeryLongToken = RandomStrings.randomAsciiAlphanumOfLength(random(), 256).concat("-trailing");
+        try (Reader reader = new StringReader(textWithVeryLongToken)) {
+            tokenizer.setReader(reader);
+            assertTokenStreamContents(tokenizer, new String[] { textWithVeryLongToken.substring(0, 255),
+                    textWithVeryLongToken.substring(255, 256), "trailing"});
+        }
+
+        final Settings analysisSettings = newAnalysisSettingsBuilder()
+            .putList("tokenize_on_chars", conf)
+            .put("max_token_length", 2)
+            .build();
+        tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, analysisSettings).create();
+        try (Reader reader = new StringReader("one-two-three")) {
+            tokenizer.setReader(reader);
+            assertTokenStreamContents(tokenizer, new String[] { "on", "e", "tw", "o", "th", "re", "e" });
+        }
+
+        final Settings tooLongLengthSettings = newAnalysisSettingsBuilder()
+            .putList("tokenize_on_chars", conf)
+            .put("max_token_length", 1024 * 1024 + 1)
+            .build();
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> new CharGroupTokenizerFactory(indexProperties, null, name, tooLongLengthSettings).create());
+        assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 1048577", e.getMessage());
+
+        final Settings negativeLengthSettings = newAnalysisSettingsBuilder()
+            .putList("tokenize_on_chars", conf)
+            .put("max_token_length", -1)
+            .build();
+        e = expectThrows(IllegalArgumentException.class,
+            () -> new CharGroupTokenizerFactory(indexProperties, null, name, negativeLengthSettings).create());
+        assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
+    }
+
     public void testTokenization() throws IOException {
         final Index index = new Index("test", "_na_");
         final String name = "cg";
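
For anyone who wants to see the new behaviour outside the test suite, below is a minimal standalone sketch (not part of the patch). It drives Lucene's CharTokenizer through the same two-argument constructor that CharGroupTokenizerFactory.create() now calls; the class name MaxTokenLengthDemo, the limit of 5, and the sample text are made up for illustration.

    // Standalone sketch (not from the patch): shows how a CharTokenizer built
    // with a max token length splits over-long tokens, mirroring the new
    // `max_token_length` setting. Names and sample text are illustrative only.
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharTokenizer;
    import org.apache.lucene.util.AttributeFactory;

    import java.io.StringReader;

    public class MaxTokenLengthDemo {
        public static void main(String[] args) throws Exception {
            // Same constructor the factory's create() now uses; 5 plays the
            // role of the `max_token_length` setting (the factory defaults
            // to CharTokenizer.DEFAULT_MAX_WORD_LEN, i.e. 255).
            CharTokenizer tokenizer =
                    new CharTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 5) {
                @Override
                protected boolean isTokenChar(int c) {
                    return c != '-'; // split on '-', like tokenize_on_chars: ["-"]
                }
            };
            tokenizer.setReader(new StringReader("alpha-beta-gammagammagamma"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term);
            }
            tokenizer.end();
            tokenizer.close();
            // Prints: alpha, beta, gamma, gamma, gamma -- the 15-char run
            // exceeds the limit of 5, so it is emitted in 5-char slices,
            // matching the "split at max_token_length intervals" rule that
            // the docs change describes.
        }
    }

Judging by the error messages asserted in testMaxTokenLength, the bounds check (greater than 0, less than 1048576) is enforced by this Lucene constructor itself, which is why the factory can pass the setting straight through without validating it.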