Expose discard_compound_token option to kuromoji_tokenizer (#57421)
This commit exposes the new Lucene option `discard_compound_token` to the Elasticsearch Kuromoji plugin.
parent 0e57528d5d
commit c75c8b6e9d
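At the Lucene level, the option controls whether `search` mode emits the original compound token alongside its decompounded parts. A minimal standalone sketch of that behavior (not part of this commit; it relies only on the four-argument `JapaneseTokenizer` constructor visible in the Java diff below):

    import java.io.StringReader;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class DiscardCompoundTokenDemo {
        public static void main(String[] args) throws Exception {
            for (boolean discardCompoundToken : new boolean[] { false, true }) {
                // No user dictionary; discard punctuation (the plugin default); search mode.
                JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, true, discardCompoundToken, Mode.SEARCH);
                tokenizer.setReader(new StringReader("関西国際空港"));
                CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
                tokenizer.reset();
                StringBuilder tokens = new StringBuilder();
                while (tokenizer.incrementToken()) {
                    if (tokens.length() > 0) {
                        tokens.append(", ");
                    }
                    tokens.append(term.toString());
                }
                tokenizer.end();
                tokenizer.close();
                // Expected: false -> 関西, 関西国際空港, 国際, 空港
                //           true  -> 関西, 国際, 空港
                System.out.println(discardCompoundToken + ": " + tokens);
            }
        }
    }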
docs/plugins/analysis-kuromoji.asciidoc
@@ -70,7 +70,7 @@ unknown words. It can be set to:
 
   Extended mode outputs unigrams for unknown words. Example output:
 
-    関西, 国際, 空港
+    関西, 関西国際空港, 国際, 空港
     ア, ブ, ラ, カ, ダ, ブ, ラ
 --
 
@@ -208,6 +208,12 @@ The above `analyze` request returns the following:
 }
 --------------------------------------------------
 
+`discard_compound_token`::
+  Whether original compound tokens should be discarded from the output with `search` mode. Defaults to `false`.
+  Example output with `search` or `extended` mode and this option `true`:
+
+  関西, 国際, 空港
+
 [[analysis-kuromoji-baseform]]
 ==== `kuromoji_baseform` token filter
 
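The documented default maps to a plain settings lookup in the tokenizer factory, as the Java diff below shows. A minimal sketch of that resolution, assuming only the Elasticsearch `Settings` API (the demo class and `resolve` helper are illustrative, not from the commit):

    import org.elasticsearch.common.settings.Settings;

    class DiscardCompoundTokenDefault {
        // Mirrors the factory's lookup: when the setting is absent,
        // the default is false and the original compound token is kept.
        static boolean resolve(Settings settings) {
            return settings.getAsBoolean("discard_compound_token", false);
        }

        public static void main(String[] args) {
            Settings custom = Settings.builder().put("discard_compound_token", true).build();
            System.out.println(resolve(custom));         // true  -> compound tokens discarded
            System.out.println(resolve(Settings.EMPTY)); // false -> compound tokens kept
        }
    }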
plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
@@ -41,21 +41,24 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
     private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
     private static final String NBEST_COST = "nbest_cost";
     private static final String NBEST_EXAMPLES = "nbest_examples";
+    private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";
 
     private final UserDictionary userDictionary;
     private final Mode mode;
     private final String nBestExamples;
     private final int nBestCost;
 
-    private boolean discartPunctuation;
+    private boolean discardPunctuation;
+    private boolean discardCompoundToken;
 
     public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, settings, name);
         mode = getMode(settings);
         userDictionary = getUserDictionary(env, settings);
-        discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
+        discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
         nBestCost = settings.getAsInt(NBEST_COST, -1);
         nBestExamples = settings.get(NBEST_EXAMPLES);
+        discardCompoundToken = settings.getAsBoolean(DISCARD_COMPOUND_TOKEN, false);
     }
 
     public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@@ -108,7 +111,7 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
     @Override
     public Tokenizer create() {
-        JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
+        JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discardPunctuation, discardCompoundToken, mode);
         int nBestCost = this.nBestCost;
         if (nBestExamples != null) {
             nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
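In `create()` above, the factory moves from the three-argument `JapaneseTokenizer` constructor to the four-argument one, pinning `discardCompoundToken` explicitly rather than inheriting Lucene's default. For reference, the constructor signature as invoked by the diff (a sketch, not the full Lucene source):

    // org.apache.lucene.analysis.ja.JapaneseTokenizer (lucene-analyzers-kuromoji)
    // public JapaneseTokenizer(UserDictionary userDictionary,
    //                          boolean discardPunctuation,
    //                          boolean discardCompoundToken,
    //                          Mode mode)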
plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@@ -348,6 +348,17 @@ public class KuromojiAnalysisTests extends ESTestCase {
         assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
     }
 
+    public void testDiscardCompoundToken() throws Exception {
+        TestAnalysis analysis = createTestAnalysis();
+        TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_discard_compound_token");
+        String source = "株式会社";
+        String[] expected = new String[] {"株式", "会社"};
+
+        Tokenizer tokenizer = tokenizerFactory.create();
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenizer, expected);
+    }
+
     private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
         InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
         Path home = createTempDir();
plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
@@ -60,6 +60,10 @@
           "type": "kuromoji_tokenizer",
           "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
           "nbest_cost" : "1000"
+        },
+        "kuromoji_discard_compound_token": {
+          "type": "kuromoji_tokenizer",
+          "discard_compound_token": true
         }
       },
       "analyzer" : {
@@ -68,7 +72,7 @@
           "tokenizer" : "kuromoji_tokenizer"
         }
       }
-
+
     }
   }
 }