Add support for inlined user dictionary in Nori (#36123)

Add support for inlined user dictionary in Nori

This change adds a new option called `user_dictionary_rules` to the
Nori tokenizer. It can be used to set additional tokenization rules
to the Korean tokenizer directly in the settings (instead of using a file).

Closes #35842
This commit is contained in:
Jim Ferenczi 2018-12-07 15:26:08 +01:00 committed by GitHub
parent ca09936cdf
commit a53e8653f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 99 additions and 13 deletions

View File

@ -70,7 +70,7 @@ The first token is mandatory and represents the custom noun that should be added
the dictionary. For compound nouns the custom segmentation can be provided
after the first token (`[<token 1> ... <token n>]`). The segmentation of the
custom compound nouns is controlled by the `decompound_mode` setting.
--
As a demonstration of how the user dictionary can be used, save the following
dictionary to `$ES_HOME/config/userdict_ko.txt`:
@ -153,6 +153,42 @@ The above `analyze` request returns the following:
// TESTRESPONSE // TESTRESPONSE
<1> This is a compound token that spans two positions (`mixed` mode). <1> This is a compound token that spans two positions (`mixed` mode).
--
`user_dictionary_rules`::
+
--
You can also inline the rules directly in the tokenizer definition using
the `user_dictionary_rules` option:
[source,js]
--------------------------------------------------
PUT nori_sample
{
"settings": {
"index": {
"analysis": {
"tokenizer": {
"nori_user_dict": {
"type": "nori_tokenizer",
"decompound_mode": "mixed",
"user_dictionary_rules": ["c++", "C샤프", "세종", "세종시 세종 시"]
}
},
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "nori_user_dict"
}
}
}
}
}
}
--------------------------------------------------
// CONSOLE
--
The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters
to modify the stream.

View File

@ -29,10 +29,13 @@ import org.elasticsearch.index.IndexSettings;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.Locale; import java.util.Locale;
public class NoriTokenizerFactory extends AbstractTokenizerFactory { public class NoriTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_OPTION = "user_dictionary"; private static final String USER_DICT_PATH_OPTION = "user_dictionary";
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
private final UserDictionary userDictionary; private final UserDictionary userDictionary;
private final KoreanTokenizer.DecompoundMode decompoundMode; private final KoreanTokenizer.DecompoundMode decompoundMode;
@ -44,12 +47,20 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
} }
public static UserDictionary getUserDictionary(Environment env, Settings settings) { public static UserDictionary getUserDictionary(Environment env, Settings settings) {
try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) { if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
if (reader == null) { throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
return null; " with [" + USER_DICT_RULES_OPTION + "]");
} else { }
return UserDictionary.open(reader); List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
} StringBuilder sb = new StringBuilder();
if (ruleList == null || ruleList.isEmpty()) {
return null;
}
for (String line : ruleList) {
sb.append(line).append(System.lineSeparator());
}
try (Reader rulesReader = new StringReader(sb.toString())) {
return UserDictionary.open(rulesReader);
} catch (IOException e) { } catch (IOException e) {
throw new ElasticsearchException("failed to load nori user dictionary", e); throw new ElasticsearchException("failed to load nori user dictionary", e);
} }

View File

@ -38,6 +38,7 @@ import java.io.StringReader;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.instanceOf;
public class NoriAnalysisTests extends ESTokenStreamTestCase { public class NoriAnalysisTests extends ESTokenStreamTestCase {
@ -76,6 +77,22 @@ public class NoriAnalysisTests extends ESTokenStreamTestCase {
} }
public void testNoriAnalyzerUserDict() throws Exception { public void testNoriAnalyzerUserDict() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "nori")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
.build();
TestAnalysis analysis = createTestAnalysis(settings);
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
assertTokenStreamContents(stream, new String[]{"세종", ""});
}
try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
assertTokenStreamContents(stream, new String[]{"c++", "world"});
}
}
public void testNoriAnalyzerUserDictPath() throws Exception {
Settings settings = Settings.builder() Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "nori") .put("index.analysis.analyzer.my_analyzer.type", "nori")
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt") .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
@ -91,6 +108,17 @@ public class NoriAnalysisTests extends ESTokenStreamTestCase {
} }
} }
public void testNoriAnalyzerInvalidUserDictOption() throws Exception {
    // Configuring both a user dictionary file and inline rules is ambiguous,
    // so building the analyzer must fail with a clear error message.
    final Settings.Builder builder = Settings.builder();
    builder.put("index.analysis.analyzer.my_analyzer.type", "nori");
    builder.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt");
    builder.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시");
    final Settings settings = builder.build();
    final IllegalArgumentException exc =
        expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
    assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
        "with [user_dictionary_rules]"));
}
public void testNoriTokenizer() throws Exception { public void testNoriTokenizer() throws Exception {
Settings settings = Settings.builder() Settings settings = Settings.builder()
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer") .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")

View File

@ -221,10 +221,21 @@ public class Analysis {
* If the word list cannot be found at either key. * If the word list cannot be found at either key.
*/ */
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) { public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
String wordListPath = settings.get(settingPrefix + "_path", null); return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
}
/**
* Fetches a list of words from the specified settings file. The list should either be available at the key
* specified by <code>settingList</code> or in a file specified by <code>settingPath</code>.
*
* @throws IllegalArgumentException
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
String wordListPath = settings.get(settingPath, null);
if (wordListPath == null) { if (wordListPath == null) {
List<String> explicitWordList = settings.getAsList(settingPrefix, null); List<String> explicitWordList = settings.getAsList(settingList, null);
if (explicitWordList == null) { if (explicitWordList == null) {
return null; return null;
} else { } else {
@ -238,11 +249,11 @@ public class Analysis {
return loadWordList(path, "#"); return loadWordList(path, "#");
} catch (CharacterCodingException ex) { } catch (CharacterCodingException ex) {
String message = String.format(Locale.ROOT, String message = String.format(Locale.ROOT,
"Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded", "Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
settingPrefix, path.toString()); settingPath, path.toString());
throw new IllegalArgumentException(message, ex); throw new IllegalArgumentException(message, ex);
} catch (IOException ioe) { } catch (IOException ioe) {
String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString()); String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString());
throw new IllegalArgumentException(message, ioe); throw new IllegalArgumentException(message, ioe);
} }
} }