From 4ffa050735ad7bce2b8574eceaab8bb0f3098aaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christoph=20B=C3=BCscher?=
Date: Wed, 20 Nov 2019 10:36:39 +0100
Subject: [PATCH] Allow custom characters in token_chars of ngram tokenizers
 (#49250)

Currently the `token_chars` setting in both `edgeNGram` and `ngram`
tokenizers only allows for a list of predefined character classes, which
might not fit every use case. For example, including an underscore "_" in
a token would currently require the `punctuation` class, which comes with
a lot of other characters. This change adds an additional "custom" option
to the `token_chars` setting, which requires an additional
`custom_token_chars` setting to be present and whose value is interpreted
as the set of characters to include in a token.

Closes #25894
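For illustration, a tokenizer that keeps plus, minus and underscore inside
tokens could then be defined roughly as follows (index and tokenizer names
here are made up for this message, not part of the change):

    PUT my_index
    {
      "settings": {
        "analysis": {
          "tokenizer": {
            "my_tokenizer": {
              "type": "ngram",
              "min_gram": 2,
              "max_gram": 3,
              "token_chars": [ "letter", "digit", "custom" ],
              "custom_token_chars": "+-_"
            }
          }
        }
      }
    }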
---
 .../tokenizers/edgengram-tokenizer.asciidoc   |  8 ++++
 .../tokenizers/ngram-tokenizer.asciidoc       |  8 ++++
 .../common/EdgeNGramTokenizerFactory.java     |  2 +-
 .../common/NGramTokenizerFactory.java         | 27 +++++++++--
 .../common/EdgeNGramTokenizerTests.java       | 16 +++++++
 .../common/NGramTokenizerFactoryTests.java    | 48 ++++++++++++++-----
 6 files changed, 92 insertions(+), 17 deletions(-)

diff --git a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc
index 79ca7fa4231..ea30f83af1e 100644
--- a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc
@@ -96,6 +96,14 @@ Character classes may be any of the following:
 * `whitespace` -- for example `" "` or `"\n"`
 * `punctuation` -- for example `!` or `"`
 * `symbol` -- for example `$` or `√`
+* `custom` -- custom characters which need to be set using the
+`custom_token_chars` setting.
+
+`custom_token_chars`::
+
+    Custom characters that should be treated as part of a token. For example,
+    setting this to `+-_` will make the tokenizer treat the plus, minus and
+    underscore signs as part of a token.
 
 [[max-gram-limits]]
 === Limitations of the `max_gram` parameter
diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
index 7a26304e455..a30266d5088 100644
--- a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
@@ -190,6 +190,14 @@ Character classes may be any of the following:
 * `whitespace` -- for example `" "` or `"\n"`
 * `punctuation` -- for example `!` or `"`
 * `symbol` -- for example `$` or `√`
+* `custom` -- custom characters which need to be set using the
+`custom_token_chars` setting.
+
+`custom_token_chars`::
+
+    Custom characters that should be treated as part of a token. For example,
+    setting this to `+-_` will make the tokenizer treat the plus, minus and
+    underscore signs as part of a token.
 
 TIP: It usually makes sense to set `min_gram` and `max_gram` to the same value.
 The smaller the length, the more documents will match but the lower
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java
index 2a366513f4e..f67238334f1 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java
@@ -39,7 +39,7 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
         super(indexSettings, settings, name);
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
-        this.matcher = parseTokenChars(settings.getAsList("token_chars"));
+        this.matcher = parseTokenChars(settings);
     }
 
     @Override
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java
index ff3b27b021e..c904e52cd45 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java
@@ -29,10 +29,14 @@ import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import static java.util.Collections.unmodifiableMap;
 
@@ -68,7 +72,8 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
         MATCHERS = unmodifiableMap(matchers);
     }
 
-    static CharMatcher parseTokenChars(List<String> characterClasses) {
+    static CharMatcher parseTokenChars(Settings settings) {
+        List<String> characterClasses = settings.getAsList("token_chars");
         if (characterClasses == null || characterClasses.isEmpty()) {
             return null;
         }
@@ -77,7 +82,23 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
             characterClass = characterClass.toLowerCase(Locale.ROOT).trim();
             CharMatcher matcher = MATCHERS.get(characterClass);
             if (matcher == null) {
-                throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
+                if (characterClass.equals("custom") == false) {
+                    throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + Stream
+                        .of(MATCHERS.keySet(), Collections.singleton("custom")).flatMap(x -> x.stream()).collect(Collectors.toSet()));
+                }
+                String customCharacters = settings.get("custom_token_chars");
+                if (customCharacters == null) {
+                    throw new IllegalArgumentException("Token type: 'custom' requires setting `custom_token_chars`");
+                }
+                final Set<Integer> customCharSet = customCharacters.chars().boxed().collect(Collectors.toSet());
+                matcher = new CharMatcher() {
+
+                    @Override
+                    public boolean isTokenChar(int c) {
+                        return customCharSet.contains(c);
+                    }
+
+                };
             }
             builder.or(matcher);
         }
@@ -101,7 +122,7 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
                 + "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
             }
         }
-        this.matcher = parseTokenChars(settings.getAsList("token_chars"));
+        this.matcher = parseTokenChars(settings);
     }
 
     @Override
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
index 0172f7cbc26..95bf41f8e92 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
@@ -19,11 +19,13 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -33,6 +35,7 @@ import org.elasticsearch.test.IndexSettingsModule;
 import org.elasticsearch.test.VersionUtils;
 
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.Collections;
 
 public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
@@ -95,4 +98,17 @@ public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
 
     }
 
+    public void testCustomTokenChars() throws IOException {
+        final Index index = new Index("test", "_na_");
+        final String name = "engr";
+        final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
+
+        final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
+            .putList("token_chars", "letter", "custom").put("custom_token_chars", "_-").build();
+        Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name,
+            settings).create();
+        tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
+        assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "-g", "-gh", "_j", "_jk", "lm"});
+    }
+
 }
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java
index 1cf6ef4696d..b18afbf6ae3 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java
@@ -46,25 +46,34 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase {
         final Index index = new Index("test", "_na_");
         final String name = "ngr";
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
-        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
-        for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) {
-            final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
-                .put("token_chars", tokenChars).build();
-            try {
-                new NGramTokenizerFactory(indexProperties, null, name, settings).create();
-                fail();
-            } catch (IllegalArgumentException expected) {
-                // OK
-            }
-        }
+        final IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
         for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
             final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
                 .put("token_chars", tokenChars).build();
-            indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
-
             new NGramTokenizerFactory(indexProperties, null, name, settings).create();
             // no exception
         }
+        {
+            final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
+                .put("token_chars", "DIRECTIONALITY_UNDEFINED").build();
+            IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
+                () -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
+            assertEquals("Unknown token type: 'directionality_undefined'", ex.getMessage().substring(0, 46));
+            assertTrue(ex.getMessage().contains("custom"));
+        }
+        {
+            final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
+                .put("custom_token_chars", "_-").build();
+            new NGramTokenizerFactory(indexProperties, null, name, settings).create();
+            // no exception
+        }
+        {
+            final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
+                .build();
+            IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
+                () -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
+            assertEquals("Token type: 'custom' requires setting `custom_token_chars`", ex.getMessage());
+        }
     }
 
     public void testNoTokenChars() throws IOException {
@@ -80,6 +89,19 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase {
         assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
     }
 
+    public void testCustomTokenChars() throws IOException {
+        final Index index = new Index("test", "_na_");
+        final String name = "ngr";
+        final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
+
+        final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
+            .putList("token_chars", "letter", "custom").put("custom_token_chars", "_-").build();
+        Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
+            .create();
+        tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
+        assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "bc", "-g", "-gh", "gh", "_j", "_jk", "jk", "lm"});
+    }
+
     public void testPreTokenization() throws IOException {
         // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
         final Index index = new Index("test", "_na_");
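The behavior exercised by the tests above can also be spot-checked with an
inline tokenizer definition in the `_analyze` API; a request along these
lines (the shape is illustrative, mirroring the test input) should keep `-`
and `_` inside tokens while still splitting on `=`:

    POST _analyze
    {
      "tokenizer": {
        "type": "ngram",
        "min_gram": 2,
        "max_gram": 3,
        "token_chars": [ "letter", "custom" ],
        "custom_token_chars": "_-"
      },
      "text": "Abc -gh _jk =lm"
    }

yielding the same tokens the ngram test asserts: `Ab, Abc, bc, -g, -gh, gh,
_j, _jk, jk, lm`.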