From 330911389a77957a61cabb890140f86781d45c93 Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Thu, 14 May 2015 13:41:44 +0900 Subject: [PATCH] Add user dictionary test case Closes #59 --- README.md | 2 +- pom.xml | 1 + .../index/analysis/KuromojiAnalysisTests.java | 20 +++++++++++++++++++ .../index/analysis/empty_user_dict.txt | 0 .../index/analysis/kuromoji_analysis.json | 11 ++++++++-- .../index/analysis/user_dict.txt | 1 + 6 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/analysis/empty_user_dict.txt create mode 100644 src/test/java/org/elasticsearch/index/analysis/user_dict.txt diff --git a/README.md b/README.md index 65e37c88b52..42346cb31b2 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ The following are settings that can be set for a `kuromoji_tokenizer` tokenizer |:--------------------|:--------------------------------------------------------------------------------------------------------------------------|:------------------| | mode | Tokenization mode: this determines how the tokenizer handles compound and unknown words. `normal` and `search`, `extended`| `search` | | discard_punctuation | `true` if punctuation tokens should be dropped from the output. | `true` | -| user_dict | set User Dictionary file | | +| user_dictionary | set User Dictionary file | | ### Tokenization mode diff --git a/pom.xml b/pom.xml index 1845da9de14..447e278d287 100644 --- a/pom.xml +++ b/pom.xml @@ -87,6 +87,7 @@ src/test/java **/*.json + **/*.txt diff --git a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java index 73fd78047bf..02749c7c952 100644 --- a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java @@ -245,4 +245,24 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase { return buffer.toString(); } + @Test + public void testKuromojiUserDict() throws IOException { + AnalysisService analysisService = createAnalysisService(); + TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_user_dict"); + String source = "私は制限スピードを超える。"; + String[] expected = new String[]{"私", "は", "制限スピード", "を", "超える"}; + + Tokenizer tokenizer = tokenizerFactory.create(); + tokenizer.setReader(new StringReader(source)); + assertSimpleTSOutput(tokenizer, expected); + } + + // fix #59 + @Test + public void testKuromojiEmptyUserDict() { + AnalysisService analysisService = createAnalysisService(); + TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict"); + assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class)); + } + } diff --git a/src/test/java/org/elasticsearch/index/analysis/empty_user_dict.txt b/src/test/java/org/elasticsearch/index/analysis/empty_user_dict.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json index 7642e756ceb..a36b4ae2197 100644 --- a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json +++ b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json @@ -39,9 +39,16 @@ "tokenizer" : { "kuromoji" : { - "type":"kuromoji_tokenizer" + "type":"kuromoji_tokenizer" + }, + "kuromoji_empty_user_dict" : { + "type":"kuromoji_tokenizer", + "user_dictionary":"org/elasticsearch/index/analysis/empty_user_dict.txt" + }, + "kuromoji_user_dict" : { + "type":"kuromoji_tokenizer", + "user_dictionary":"org/elasticsearch/index/analysis/user_dict.txt" } - }, "analyzer" : { "my_analyzer" : { diff --git a/src/test/java/org/elasticsearch/index/analysis/user_dict.txt b/src/test/java/org/elasticsearch/index/analysis/user_dict.txt new file mode 100644 index 00000000000..54b59d66130 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/user_dict.txt @@ -0,0 +1 @@ +制限スピード,制限スピード,セイゲンスピード,テスト名詞