Add support for inlined user dictionary in the Kuromoji plugin (#45489)
This change adds a new option called `user_dictionary_rules` to the Kuromoji tokenizer. It can be used to pass additional tokenization rules to the Japanese tokenizer directly in the index settings (instead of through a file). This commit also adds a check that no rule is duplicated, since duplicate entries are not allowed in the `UserDictionary`.

Closes #25343
parent 3318c91fea
commit fe2a7523ec
docs/plugins/analysis-kuromoji.asciidoc

@@ -98,6 +98,39 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
 -----------------------
+
+--
+
+You can also inline the rules directly in the tokenizer definition using
+the `user_dictionary_rules` option:
+
+[source,js]
+--------------------------------------------------
+PUT kuromoji_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "tokenizer": {
+          "kuromoji_user_dict": {
+            "type": "kuromoji_tokenizer",
+            "mode": "extended",
+            "user_dictionary_rules": ["東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"]
+          }
+        },
+        "analyzer": {
+          "my_analyzer": {
+            "type": "custom",
+            "tokenizer": "kuromoji_user_dict"
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+--
 
 `nbest_cost`/`nbest_examples`::
 +
 --
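To make the rule format concrete outside of Elasticsearch, here is a minimal standalone sketch that feeds the same CSV rule straight into Lucene's kuromoji module (the class name and `main` harness are illustrative, not part of this commit). Each rule is `surface,segmentation,readings,part-of-speech`, so the entry above teaches the tokenizer to split 東京スカイツリー into 東京 and スカイツリー:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class InlineRuleDemo {
    public static void main(String[] args) throws Exception {
        // One rule, in the same CSV shape as user_dictionary_rules above.
        UserDictionary dict = UserDictionary.open(
            new StringReader("東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞\n"));
        try (Tokenizer tokenizer = new JapaneseTokenizer(dict, true, JapaneseTokenizer.Mode.EXTENDED)) {
            tokenizer.setReader(new StringReader("東京スカイツリー"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term); // 東京, then スカイツリー
            }
            tokenizer.end();
        }
    }
}
```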
plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java

@@ -23,17 +23,22 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
+import org.apache.lucene.analysis.ja.util.CSVUtil;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
-    private static final String USER_DICT_OPTION = "user_dictionary";
+    private static final String USER_DICT_PATH_OPTION = "user_dictionary";
+    private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
     private static final String NBEST_COST = "nbest_cost";
     private static final String NBEST_EXAMPLES = "nbest_examples";
@@ -54,17 +59,33 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
     }
 
     public static UserDictionary getUserDictionary(Environment env, Settings settings) {
+        if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
+            throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
+                " with [" + USER_DICT_RULES_OPTION + "]");
+        }
         try {
-            final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION);
-            if (reader == null) {
+            List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false);
+            if (ruleList == null || ruleList.isEmpty()) {
                 return null;
-            } else {
-                try {
-                    return UserDictionary.open(reader);
-                } finally {
-                    reader.close();
-                }
             }
+            Set<String> dup = new HashSet<>();
+            int lineNum = 0;
+            for (String line : ruleList) {
+                // ignore comments
+                if (line.startsWith("#") == false) {
+                    String[] values = CSVUtil.parse(line);
+                    if (dup.add(values[0]) == false) {
+                        throw new IllegalArgumentException("Found duplicate term [" + values[0] + "] in user dictionary " +
+                            "at line [" + lineNum + "]");
+                    }
+                }
+                ++ lineNum;
+            }
+            StringBuilder sb = new StringBuilder();
+            for (String line : ruleList) {
+                sb.append(line).append(System.lineSeparator());
+            }
+            return UserDictionary.open(new StringReader(sb.toString()));
         } catch (IOException e) {
             throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
         }
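A note on the check above: `CSVUtil.parse` splits one rule line into its CSV fields, and the first field is the surface form that `UserDictionary` indexes, which is why `values[0]` is the value that must be unique. A tiny sketch (illustrative harness, not part of the commit):

```java
import org.apache.lucene.analysis.ja.util.CSVUtil;

public class RuleFieldsDemo {
    public static void main(String[] args) {
        // The duplicate check keys on values[0], the surface form.
        String[] values = CSVUtil.parse("東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞");
        System.out.println(values[0]);     // 東京スカイツリー
        System.out.println(values.length); // 4 fields
    }
}
```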
plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java

@@ -19,6 +19,7 @@
 
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
@@ -39,6 +40,8 @@ import java.io.StringReader;
 import java.nio.file.Files;
 import java.nio.file.Path;
 
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
+import static org.hamcrest.CoreMatchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.instanceOf;
@@ -307,4 +310,55 @@ public class KuromojiAnalysisTests extends ESTestCase {
         tokenizer.setReader(new StringReader(source));
         assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
     }
+
+    public void testKuromojiAnalyzerUserDict() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
+            .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
+            assertTokenStreamContents(stream, new String[]{"制限スピード"});
+        }
+
+        try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
+            assertTokenStreamContents(stream, new String[]{"c++", "world"});
+        }
+    }
+
+    public void testKuromojiAnalyzerInvalidUserDictOption() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
+            .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
+            .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w")
+            .build();
+        IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
+        assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
+            "with [user_dictionary_rules]"));
+    }
+
+    public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
+            .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules",
+                "c++,c++,w,w", "#comment", "制限スピード,制限スピード,セイゲンスピード,テスト名詞", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
+            .build();
+        IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
+        assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
+    }
+
+    private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
+        InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
+        Path home = createTempDir();
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        Files.copy(dict, config.resolve("user_dict.txt"));
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put(Environment.PATH_HOME_SETTING.getKey(), home)
+            .put(analysisSettings)
+            .build();
+        return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisKuromojiPlugin());
+    }
 }
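Two details of these tests are easy to miss. The rule `c++,c++,w,w` declares `c++` as a single dictionary entry whose segmentation equals its surface form, which is why `c++world` is expected to come out as `c++` followed by `world`. And in `testKuromojiAnalyzerDuplicateUserDictRule`, the duplicate is the fourth rule but the message reports line [3] because `lineNum` in the factory is zero-based and the `#comment` line, although skipped by the duplicate check, still advances the counter.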
plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java

@@ -51,7 +51,7 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
             throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
                 " with [" + USER_DICT_RULES_OPTION + "]");
         }
-        List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
+        List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true);
         StringBuilder sb = new StringBuilder();
         if (ruleList == null || ruleList.isEmpty()) {
             return null;
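Nori passes `true` here, so its rule lists keep the old comment-stripping behavior; only the Kuromoji factory opts out (passing `false`) so that it can see comment lines while validating rules and reporting line numbers.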
server/src/main/java/org/elasticsearch/index/analysis/Analysis.java

@@ -221,7 +221,7 @@ public class Analysis {
      *         If the word list cannot be found at either key.
      */
     public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
-        return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
+        return getWordList(env, settings, settingPrefix + "_path", settingPrefix, true);
     }
 
     /**
@@ -231,7 +231,8 @@ public class Analysis {
      * @throws IllegalArgumentException
      *         If the word list cannot be found at either key.
      */
-    public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
+    public static List<String> getWordList(Environment env, Settings settings,
+                                           String settingPath, String settingList, boolean removeComments) {
         String wordListPath = settings.get(settingPath, null);
 
         if (wordListPath == null) {
@@ -246,7 +247,7 @@ public class Analysis {
         final Path path = env.configFile().resolve(wordListPath);
 
         try {
-            return loadWordList(path, "#");
+            return loadWordList(path, removeComments);
         } catch (CharacterCodingException ex) {
             String message = String.format(Locale.ROOT,
                 "Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
@@ -258,15 +259,15 @@ public class Analysis {
         }
     }
 
-    private static List<String> loadWordList(Path path, String comment) throws IOException {
+    private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
         final List<String> result = new ArrayList<>();
         try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
             String word;
             while ((word = br.readLine()) != null) {
-                if (!Strings.hasText(word)) {
+                if (Strings.hasText(word) == false) {
                     continue;
                 }
-                if (!word.startsWith(comment)) {
+                if (removeComments == false || word.startsWith("#") == false) {
                     result.add(word.trim());
                 }
             }
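The new `removeComments` flag exists because the callers need different behavior: the Kuromoji factory asks for the raw list (`false`) since it wants line numbers computed against the unfiltered rules, and Lucene's Japanese `UserDictionary` already ignores `#` comment lines on its own; Nori and the pre-existing `getWordList(env, settings, settingPrefix)` overload pass `true` to preserve the old semantics.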