Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1306476 13f79535-47bb-0310-9956-ffa450edef68
2012-03-28 17:20:48 +00:00 · 2012-03-28 17:20:48 +00:00 · ec18632428
parent 0ced8b5020
commit ec18632428
9 changed files with 65 additions and 24 deletions
--- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@ -99,8 +99,9 @@ public final class UserDictionary implements Dictionary {
      String pos = values[3];
      if (segmentation.length != readings.length) {
-        // FIXME: Should probably deal with this differently.  Exception?
+        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
-        System.out.println("This entry is not properly formatted : " + line);
+                                   " - the number of segmentations (" + segmentation.length + ")" +
                                   " does not the match number of readings (" + readings.length + ")");
      }
      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
@ -1,5 +1,5 @@
 #
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
 #
 # Any token with a part-of-speech tag that exactly matches one of those defined in this
 # file is removed from the token stream.
@ -417,4 +417,4 @@
 #  unknown: unknown part of speech.
 #未知語
 #
-##### End of file
+##### End of file
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
@ -6,7 +6,7 @@
 # for frequency lists, etc. that can be useful for making your own set (if desired)
 #
 # Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter.  When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter.  When editing this file, note
 # that comments are not allowed on the same line as stopwords.
 #
 # Also note that stopping is done in a case-insensitive manner.  Change your StopFilter
--- a/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.JapaneseTokenizerFactory"
 *       mode=NORMAL
- *       user-dictionary=user.txt
+ *       userDictionary=user.txt
- *       user-dictionary-encoding=UTF-8
+ *       userDictionaryEncoding=UTF-8
 *     /&gt;
 *     &lt;filter class="solr.JapaneseBaseFormFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
 public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
  private static final String MODE = "mode";
-  private static final String USER_DICT_PATH = "user-dictionary";
+  private static final String USER_DICT_PATH = "userDictionary";
-  private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
+  private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
  private UserDictionary userDictionary;
  private Mode mode;
--- a/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenTestCase {
        "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
    Map<String,String> args = new HashMap<String,String>();
-    args.put("user-dictionary", "userdict.txt");
+    args.put("userDictionary", "userdict.txt");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(userDict));
    TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));
--- a/solr/example/solr/conf/lang/stoptags_ja.txt
+++ b/solr/example/solr/conf/lang/stoptags_ja.txt
@ -1,5 +1,5 @@
 #
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
 #
 # Any token with a part-of-speech tag that exactly matches one of those defined in this
 # file is removed from the token stream.
@ -417,4 +417,4 @@
 #  unknown: unknown part of speech.
 #未知語
 #
-##### End of file
+##### End of file
--- a/solr/example/solr/conf/lang/stopwords_ja.txt
+++ b/solr/example/solr/conf/lang/stopwords_ja.txt
@ -6,7 +6,7 @@
 # for frequency lists, etc. that can be useful for making your own set (if desired)
 #
 # Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter.  When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter.  When editing this file, note
 # that comments are not allowed on the same line as stopwords.
 #
 # Also note that stopping is done in a case-insensitive manner.  Change your StopFilter
--- a/solr/example/solr/conf/lang/userdict_ja.txt
+++ b/solr/example/solr/conf/lang/userdict_ja.txt
@ -0,0 +1,29 @@
 #
 # This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
 #
 # Add entries to this file in order to override the statistical model in terms
 # of segmentation, readings and part-of-speech tags.  Notice that entries do
 # not have weights since they are always used when found.  This is by-design
 # in order to maximize ease-of-use.
 #
 # Entries are defined using the following CSV format:
 #  <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
 #
 # Notice that a single half-width space separates tokens and readings, and
 # that the number of tokens and readings must match exactly.
 #
 # Also notice that the behavior of multiple entries with the same <text> is undefined.
 #
 # Whitespace only lines are ignored.  Comments are not allowed on entry lines.
 #
 # Custom segmentation for kanji compounds
 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
 # Custom segmentation for compound katakana
 トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
 ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
 # Custom reading for former sumo wrestler
 朝青龍,朝青龍,アサショウリュウ,カスタム人名
--- a/solr/example/solr/conf/schema.xml
+++ b/solr/example/solr/conf/schema.xml
@ -709,24 +709,35 @@
    -->
    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
      <analyzer>
-      <!-- Kuromoji Japanese morphological analyzer/tokenizer.
+      <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
-           Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
+           Kuromoji has a search mode (default) that does segmentation useful for search.  A heuristic
-           
+           is used to segment compounds into their parts and the compound itself is kept as a synonym.
           Example:
             関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
             so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
             (With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
-           Valid values for mode are:
+           Valid values for attribute mode are:
              normal: regular segmentation
-              search: segmentation useful for search with extra splitting (default)
+              search: segmentation useful for search with compounds kept as synonyms (default)
            extended: same as search mode, but unigrams unknown words (experimental)
-           NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
+           For some applications it might be good to use search mode for indexing and normal mode for
           queries to reduce recall and prevent parts of compounds from being matched and highlighted.
           Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
           Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
           model with your own entries for segmentation, part-of-speech tags and readings without a need
           to specify weights.  Notice that user dictionaries have not been subject to extensive testing.
           User dictionary attributes are:
                     userDictionary: user dictionary filename
             userDictionaryEncoding: user dictionary encoding (default is UTF-8)
           See lang/userdict_ja.txt for a sample user dictionary file.
           See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
        -->
        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
-        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->	
+        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
        <filter class="solr.JapaneseBaseFormFilterFactory"/>
        <!-- Removes tokens with certain part-of-speech tags -->
        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>