Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1306476 13f79535-47bb-0310-9956-ffa450edef68
Christian Moen 2012-03-28 17:20:48 +00:00
parent 0ced8b5020
commit ec18632428
9 changed files with 65 additions and 24 deletions

View File

@@ -99,8 +99,9 @@ public final class UserDictionary implements Dictionary {
String pos = values[3];
if (segmentation.length != readings.length) {
// FIXME: Should probably deal with this differently. Exception?
System.out.println("This entry is not properly formatted : " + line);
throw new RuntimeException("Illegal user dictionary entry " + values[0] +
" - the number of segmentations (" + segmentation.length + ")" +
" does not the match number of readings (" + readings.length + ")");
}
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
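For illustration, a hypothetical entry like the one below, with three segments but only two readings, would now fail with the RuntimeException above instead of only being reported on stdout:

関西国際空港,関西 国際 空港,カンサイ クウコウ,カスタム名詞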

View File

@@ -1,5 +1,5 @@
#
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file is removed from the token stream.

View File

@@ -6,7 +6,7 @@
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter

View File

@@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
* <analyzer>
* <tokenizer class="solr.JapaneseTokenizerFactory"
* mode=NORMAL
* user-dictionary=user.txt
* user-dictionary-encoding=UTF-8
* userDictionary=user.txt
* userDictionaryEncoding=UTF-8
* />
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
* </analyzer>
@@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
private static final String MODE = "mode";
private static final String USER_DICT_PATH = "user-dictionary";
private static final String USER_DICT_PATH = "userDictionary";
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
private UserDictionary userDictionary;
private Mode mode;
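For context, a minimal sketch of how a factory could consume the renamed attributes in inform(), assuming the base class exposes the init arguments as args and that UserDictionary accepts a Reader; this is illustrative, not the committed implementation (java.io imports omitted):

public void inform(ResourceLoader loader) {
  String userDictionaryPath = args.get(USER_DICT_PATH);   // "userDictionary"
  if (userDictionaryPath == null) {
    userDictionary = null;                                 // no user dictionary configured
    return;
  }
  String encoding = args.get(USER_DICT_ENCODING);          // "userDictionaryEncoding"
  if (encoding == null) {
    encoding = "UTF-8";                                    // documented default
  }
  try {
    // Resolve the file through the Solr ResourceLoader and parse it.
    Reader reader = new InputStreamReader(loader.openResource(userDictionaryPath), encoding);
    userDictionary = new UserDictionary(reader);           // assumes a Reader-accepting constructor
  } catch (IOException e) {
    throw new RuntimeException("Error loading user dictionary " + userDictionaryPath, e);
  }
}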

View File

@@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenTestCase {
"朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("user-dictionary", "userdict.txt");
args.put("userDictionary", "userdict.txt");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(userDict));
TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));

View File

@@ -1,5 +1,5 @@
#
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file is removed from the token stream.

View File

@@ -6,7 +6,7 @@
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter

View File

@@ -0,0 +1,29 @@
#
# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
#
# Add entries to this file in order to override the statistical model in terms
# of segmentation, readings and part-of-speech tags. Notice that entries do
# not have weights since they are always used when found. This is by design
# in order to maximize ease-of-use.
#
# Entries are defined using the following CSV format:
# <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
#
# Notice that a single half-width space separates tokens and readings, and
# that the number of tokens and readings must match exactly.
#
# Also notice that the behavior of multiple entries with the same <text> is undefined.
#
# Whitespace-only lines are ignored. Comments are not allowed on entry lines.
#
# Custom segmentation for kanji compounds
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
# Custom segmentation for compound katakana
トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
# Custom reading for former sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名
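A user dictionary like the sample above is wired into the field type through the tokenizer factory, as in the commented-out line added to the example schema in this commit; the explicit encoding attribute shown here is optional since UTF-8 is the default:

<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt" userDictionaryEncoding="UTF-8"/>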

View File

@@ -709,23 +709,34 @@
-->
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<!-- Kuromoji Japanese morphological analyzer/tokenizer.
<!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
is used to segment compounds into their parts, and the compound itself is kept as a synonym.
Example:
関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
(With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
Valid values for mode are:
Valid values for attribute mode are:
normal: regular segmentation
search: segmentation useful for search with extra splitting (default)
search: segmentation useful for search, with compounds kept as synonyms (default)
extended: same as search mode, but unigrams unknown words (experimental)
NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
For some applications it might be good to use search mode for indexing and normal mode for
queries to reduce recall and prevent parts of compounds from being matched and highlighted.
Use <analyzer type="index"> and <analyzer type="query"> for this, with mode normal in the query analyzer (see the sketch after this hunk).
Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags and readings without a need
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
User dictionary attributes are:
userDictionary: user dictionary filename
userDictionaryEncoding: user dictionary encoding (default is UTF-8)
See lang/userdict_ja.txt for a sample user dictionary file.
See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
-->
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<!-- Removes tokens with certain part-of-speech tags -->
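A minimal sketch of the index/query split described in the comment above, assuming a hypothetical field type name text_ja_split and an abbreviated filter chain:

<fieldType name="text_ja_split" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
  <!-- search mode at index time: compounds are decompounded and the compound itself is kept as a synonym -->
  <analyzer type="index">
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    <filter class="solr.JapaneseBaseFormFilterFactory"/>
  </analyzer>
  <!-- normal mode at query time: compounds stay whole, reducing recall and avoiding partial-compound matches and highlighting -->
  <analyzer type="query">
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="normal"/>
    <filter class="solr.JapaneseBaseFormFilterFactory"/>
  </analyzer>
</fieldType>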