mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-28 10:58:30 +00:00
Add support for inlined user dictionary in Nori (#36123)
Add support for inlined user dictionary in Nori. This change adds a new option called `user_dictionary_rules` to the Nori tokenizer. It can be used to set additional tokenization rules for the Korean tokenizer directly in the settings (instead of using a file). Closes #35842
This commit is contained in:
parent
ca09936cdf
commit
a53e8653f2
@ -70,7 +70,7 @@ The first token is mandatory and represents the custom noun that should be added
|
|||||||
the dictionary. For compound nouns the custom segmentation can be provided
|
the dictionary. For compound nouns the custom segmentation can be provided
|
||||||
after the first token (`[<token 1> ... <token n>]`). The segmentation of the
|
after the first token (`[<token 1> ... <token n>]`). The segmentation of the
|
||||||
custom compound nouns is controlled by the `decompound_mode` setting.
|
custom compound nouns is controlled by the `decompound_mode` setting.
|
||||||
--
|
|
||||||
|
|
||||||
As a demonstration of how the user dictionary can be used, save the following
|
As a demonstration of how the user dictionary can be used, save the following
|
||||||
dictionary to `$ES_HOME/config/userdict_ko.txt`:
|
dictionary to `$ES_HOME/config/userdict_ko.txt`:
|
||||||
@ -153,6 +153,42 @@ The above `analyze` request returns the following:
|
|||||||
// TESTRESPONSE
|
// TESTRESPONSE
|
||||||
|
|
||||||
<1> This is a compound token that spans two positions (`mixed` mode).
|
<1> This is a compound token that spans two positions (`mixed` mode).
|
||||||
|
--
|
||||||
|
|
||||||
|
`user_dictionary_rules`::
|
||||||
|
+
|
||||||
|
--
|
||||||
|
|
||||||
|
You can also inline the rules directly in the tokenizer definition using
|
||||||
|
the `user_dictionary_rules` option:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
PUT nori_sample
|
||||||
|
{
|
||||||
|
"settings": {
|
||||||
|
"index": {
|
||||||
|
"analysis": {
|
||||||
|
"tokenizer": {
|
||||||
|
"nori_user_dict": {
|
||||||
|
"type": "nori_tokenizer",
|
||||||
|
"decompound_mode": "mixed",
|
||||||
|
"user_dictionary_rules": ["c++", "C샤프", "세종", "세종시 세종 시"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"analyzer": {
|
||||||
|
"my_analyzer": {
|
||||||
|
"type": "custom",
|
||||||
|
"tokenizer": "nori_user_dict"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
// CONSOLE
|
||||||
|
--
|
||||||
|
|
||||||
The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters
|
The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters
|
||||||
to modify the stream.
|
to modify the stream.
|
||||||
|
@ -29,10 +29,13 @@ import org.elasticsearch.index.IndexSettings;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
|
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
|
||||||
private static final String USER_DICT_OPTION = "user_dictionary";
|
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
|
||||||
|
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
|
||||||
|
|
||||||
private final UserDictionary userDictionary;
|
private final UserDictionary userDictionary;
|
||||||
private final KoreanTokenizer.DecompoundMode decompoundMode;
|
private final KoreanTokenizer.DecompoundMode decompoundMode;
|
||||||
@ -44,12 +47,20 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
|
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
|
||||||
try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) {
|
if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
|
||||||
if (reader == null) {
|
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
|
||||||
return null;
|
" with [" + USER_DICT_RULES_OPTION + "]");
|
||||||
} else {
|
}
|
||||||
return UserDictionary.open(reader);
|
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
|
||||||
}
|
StringBuilder sb = new StringBuilder();
|
||||||
|
if (ruleList == null || ruleList.isEmpty()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
for (String line : ruleList) {
|
||||||
|
sb.append(line).append(System.lineSeparator());
|
||||||
|
}
|
||||||
|
try (Reader rulesReader = new StringReader(sb.toString())) {
|
||||||
|
return UserDictionary.open(rulesReader);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new ElasticsearchException("failed to load nori user dictionary", e);
|
throw new ElasticsearchException("failed to load nori user dictionary", e);
|
||||||
}
|
}
|
||||||
|
@ -38,6 +38,7 @@ import java.io.StringReader;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import static org.hamcrest.Matchers.containsString;
|
||||||
import static org.hamcrest.Matchers.instanceOf;
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
public class NoriAnalysisTests extends ESTokenStreamTestCase {
|
public class NoriAnalysisTests extends ESTokenStreamTestCase {
|
||||||
@ -76,6 +77,22 @@ public class NoriAnalysisTests extends ESTokenStreamTestCase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void testNoriAnalyzerUserDict() throws Exception {
|
public void testNoriAnalyzerUserDict() throws Exception {
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.type", "nori")
|
||||||
|
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
|
||||||
|
.build();
|
||||||
|
TestAnalysis analysis = createTestAnalysis(settings);
|
||||||
|
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
|
||||||
|
try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
|
||||||
|
assertTokenStreamContents(stream, new String[]{"세종", "시"});
|
||||||
|
}
|
||||||
|
|
||||||
|
try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
|
||||||
|
assertTokenStreamContents(stream, new String[]{"c++", "world"});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNoriAnalyzerUserDictPath() throws Exception {
|
||||||
Settings settings = Settings.builder()
|
Settings settings = Settings.builder()
|
||||||
.put("index.analysis.analyzer.my_analyzer.type", "nori")
|
.put("index.analysis.analyzer.my_analyzer.type", "nori")
|
||||||
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
|
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
|
||||||
@ -91,6 +108,17 @@ public class NoriAnalysisTests extends ESTokenStreamTestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testNoriAnalyzerInvalidUserDictOption() throws Exception {
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.type", "nori")
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
|
||||||
|
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
|
||||||
|
.build();
|
||||||
|
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
|
||||||
|
assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
|
||||||
|
"with [user_dictionary_rules]"));
|
||||||
|
}
|
||||||
|
|
||||||
public void testNoriTokenizer() throws Exception {
|
public void testNoriTokenizer() throws Exception {
|
||||||
Settings settings = Settings.builder()
|
Settings settings = Settings.builder()
|
||||||
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
|
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
|
||||||
|
@ -221,10 +221,21 @@ public class Analysis {
|
|||||||
* If the word list cannot be found at either key.
|
* If the word list cannot be found at either key.
|
||||||
*/
|
*/
|
||||||
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
|
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
|
||||||
String wordListPath = settings.get(settingPrefix + "_path", null);
|
return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetches a list of words from the specified settings file. The list should either be available at the key
|
||||||
|
* specified by <code>settingList</code> or in a file specified by <code>settingPath</code>.
|
||||||
|
*
|
||||||
|
* @throws IllegalArgumentException
|
||||||
|
* If the word list file referenced by <code>settingPath</code> cannot be read or is not UTF-8 encoded.
|
||||||
|
*/
|
||||||
|
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
|
||||||
|
String wordListPath = settings.get(settingPath, null);
|
||||||
|
|
||||||
if (wordListPath == null) {
|
if (wordListPath == null) {
|
||||||
List<String> explicitWordList = settings.getAsList(settingPrefix, null);
|
List<String> explicitWordList = settings.getAsList(settingList, null);
|
||||||
if (explicitWordList == null) {
|
if (explicitWordList == null) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
@ -238,11 +249,11 @@ public class Analysis {
|
|||||||
return loadWordList(path, "#");
|
return loadWordList(path, "#");
|
||||||
} catch (CharacterCodingException ex) {
|
} catch (CharacterCodingException ex) {
|
||||||
String message = String.format(Locale.ROOT,
|
String message = String.format(Locale.ROOT,
|
||||||
"Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded",
|
"Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
|
||||||
settingPrefix, path.toString());
|
settingPath, path.toString());
|
||||||
throw new IllegalArgumentException(message, ex);
|
throw new IllegalArgumentException(message, ex);
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString());
|
String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString());
|
||||||
throw new IllegalArgumentException(message, ioe);
|
throw new IllegalArgumentException(message, ioe);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user