Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1306476 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christian Moen 2012-03-28 17:20:48 +00:00
parent 0ced8b5020
commit ec18632428
9 changed files with 65 additions and 24 deletions

View File

@ -99,8 +99,9 @@ public final class UserDictionary implements Dictionary {
String pos = values[3];
if (segmentation.length != readings.length) {
// FIXME: Should probably deal with this differently. Exception?
System.out.println("This entry is not properly formatted : " + line);
throw new RuntimeException("Illegal user dictionary entry " + values[0] +
" - the number of segmentations (" + segmentation.length + ")" +
" does not match the number of readings (" + readings.length + ")");
}
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....

View File

@ -1,5 +1,5 @@
#
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file are removed from the token stream.
@ -417,4 +417,4 @@
# unknown: unknown part of speech.
#未知語
#
##### End of file
##### End of file

View File

@ -6,7 +6,7 @@
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter

View File

@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
* <analyzer>
* <tokenizer class="solr.JapaneseTokenizerFactory"
* mode=NORMAL
* user-dictionary=user.txt
* user-dictionary-encoding=UTF-8
* userDictionary=user.txt
* userDictionaryEncoding=UTF-8
* />
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
* </analyzer>
@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
private static final String MODE = "mode";
private static final String USER_DICT_PATH = "user-dictionary";
private static final String USER_DICT_PATH = "userDictionary";
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
private UserDictionary userDictionary;
private Mode mode;

View File

@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenTestCase {
"朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("user-dictionary", "userdict.txt");
args.put("userDictionary", "userdict.txt");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(userDict));
TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));

View File

@ -1,5 +1,5 @@
#
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
#
# Any token with a part-of-speech tag that exactly matches those defined in this
# file are removed from the token stream.
@ -417,4 +417,4 @@
# unknown: unknown part of speech.
#未知語
#
##### End of file
##### End of file

View File

@ -6,7 +6,7 @@
# for frequency lists, etc. that can be useful for making your own set (if desired)
#
# Note that there is an overlap between these stopwords and the terms stopped when used
# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
# that comments are not allowed on the same line as stopwords.
#
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter

View File

@ -0,0 +1,29 @@
#
# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
#
# Add entries to this file in order to override the statistical model in terms
# of segmentation, readings and part-of-speech tags. Notice that entries do
# not have weights since they are always used when found. This is by-design
# in order to maximize ease-of-use.
#
# Entries are defined using the following CSV format:
# <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
#
# Notice that a single half-width space separates tokens and readings, and
# that the number of tokens and readings must match exactly.
#
# Also notice that the behavior of multiple entries with the same <text> is undefined.
#
# Whitespace only lines are ignored. Comments are not allowed on entry lines.
#
# Custom segmentation for kanji compounds
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
# Custom segmentation for compound katakana
トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
# Custom reading for former sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名

View File

@ -709,24 +709,35 @@
-->
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<!-- Kuromoji Japanese morphological analyzer/tokenizer.
<!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
Example:
関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
(With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
is used to segment compounds into its parts and the compound itself is kept as synonym.
Valid values for mode are:
Valid values for attribute mode are:
normal: regular segmentation
search: segmentation useful for search with extra splitting (default)
search: segmentation useful for search with synonym compounds (default)
extended: same as search mode, but unigrams unknown words (experimental)
NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
For some applications it might be good to use search mode for indexing and normal mode for
queries to reduce recall and prevent parts of compounds from being matched and highlighted.
Use <analyzer type="index"> and <analyzer type="query"> for this, setting mode="normal" in the query analyzer.
Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags and readings without a need
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
User dictionary attributes are:
userDictionary: user dictionary filename
userDictionaryEncoding: user dictionary encoding (default is UTF-8)
See lang/userdict_ja.txt for a sample user dictionary file.
See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
-->
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
<!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<!-- Removes tokens with certain part-of-speech tags -->
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>