Add support for inlined user dictionary in the Kuromoji plugin (#45489)

This change adds a new option called user_dictionary_rules to
Kuromoji's tokenizer. It can be used to set additional tokenization rules
to the Japanese tokenizer directly in the settings (instead of using a file).
This commit also adds a check that no rules are duplicated since this is not allowed
in the UserDictionary.

Closes #25343
This commit is contained in:
Jim Ferenczi 2019-08-20 15:06:01 +02:00 committed by jimczi
parent 3318c91fea
commit fe2a7523ec
7 changed files with 126 additions and 17 deletions

View File

@ -98,6 +98,39 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
-----------------------
--
You can also inline the rules directly in the tokenizer definition using
the `user_dictionary_rules` option:
[source,js]
--------------------------------------------------
PUT nori_sample
{
"settings": {
"index": {
"analysis": {
"tokenizer": {
"kuromoji_user_dict": {
"type": "kuromoji_tokenizer",
"mode": "extended",
"user_dictionary_rules": ["東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"]
}
},
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "kuromoji_user_dict"
}
}
}
}
}
}
--------------------------------------------------
// CONSOLE
--
`nbest_cost`/`nbest_examples`::
+
--

View File

@ -23,17 +23,22 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_OPTION = "user_dictionary";
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
private static final String NBEST_COST = "nbest_cost";
private static final String NBEST_EXAMPLES = "nbest_examples";
@ -54,17 +59,33 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
}
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
" with [" + USER_DICT_RULES_OPTION + "]");
}
try {
final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION);
if (reader == null) {
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false);
if (ruleList == null || ruleList.isEmpty()) {
return null;
} else {
try {
return UserDictionary.open(reader);
} finally {
reader.close();
}
}
Set<String> dup = new HashSet<>();
int lineNum = 0;
for (String line : ruleList) {
// ignore comments
if (line.startsWith("#") == false) {
String[] values = CSVUtil.parse(line);
if (dup.add(values[0]) == false) {
throw new IllegalArgumentException("Found duplicate term [" + values[0] + "] in user dictionary " +
"at line [" + lineNum + "]");
}
}
++ lineNum;
}
StringBuilder sb = new StringBuilder();
for (String line : ruleList) {
sb.append(line).append(System.lineSeparator());
}
return UserDictionary.open(new StringReader(sb.toString()));
} catch (IOException e) {
throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
}

View File

@ -19,6 +19,7 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
@ -39,6 +40,8 @@ import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.instanceOf;
@ -307,4 +310,55 @@ public class KuromojiAnalysisTests extends ESTestCase {
tokenizer.setReader(new StringReader(source));
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
public void testKuromojiAnalyzerUserDict() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
.build();
TestAnalysis analysis = createTestAnalysis(settings);
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
assertTokenStreamContents(stream, new String[]{"制限スピード"});
}
try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
assertTokenStreamContents(stream, new String[]{"c++", "world"});
}
}
public void testKuromojiAnalyzerInvalidUserDictOption() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w")
.build();
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
"with [user_dictionary_rules]"));
}
public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules",
"c++,c++,w,w", "#comment", "制限スピード,制限スピード,セイゲンスピード,テスト名詞", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
.build();
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
}
private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Path home = createTempDir();
Path config = home.resolve("config");
Files.createDirectory(config);
Files.copy(dict, config.resolve("user_dict.txt"));
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), home)
.put(analysisSettings)
.build();
return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisKuromojiPlugin());
}
}

View File

@ -51,7 +51,7 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
" with [" + USER_DICT_RULES_OPTION + "]");
}
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true);
StringBuilder sb = new StringBuilder();
if (ruleList == null || ruleList.isEmpty()) {
return null;

View File

@ -221,7 +221,7 @@ public class Analysis {
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
return getWordList(env, settings, settingPrefix + "_path", settingPrefix, true);
}
/**
@ -231,7 +231,8 @@ public class Analysis {
* @throws IllegalArgumentException
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
public static List<String> getWordList(Environment env, Settings settings,
String settingPath, String settingList, boolean removeComments) {
String wordListPath = settings.get(settingPath, null);
if (wordListPath == null) {
@ -246,7 +247,7 @@ public class Analysis {
final Path path = env.configFile().resolve(wordListPath);
try {
return loadWordList(path, "#");
return loadWordList(path, removeComments);
} catch (CharacterCodingException ex) {
String message = String.format(Locale.ROOT,
"Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
@ -258,15 +259,15 @@ public class Analysis {
}
}
private static List<String> loadWordList(Path path, String comment) throws IOException {
private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
final List<String> result = new ArrayList<>();
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
String word;
while ((word = br.readLine()) != null) {
if (!Strings.hasText(word)) {
if (Strings.hasText(word) == false) {
continue;
}
if (!word.startsWith(comment)) {
if (removeComments == false || word.startsWith("#") == false) {
result.add(word.trim());
}
}