mirror of https://github.com/apache/lucene.git
Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1306476 13f79535-47bb-0310-9956-ffa450edef68
parent 0ced8b5020
commit ec18632428
@@ -99,8 +99,9 @@ public final class UserDictionary implements Dictionary {
       String pos = values[3];
 
       if (segmentation.length != readings.length) {
-        // FIXME: Should probably deal with this differently. Exception?
-        System.out.println("This entry is not properly formatted : " + line);
+        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
+          " - the number of segmentations (" + segmentation.length + ")" +
+          " does not match the number of readings (" + readings.length + ")");
       }
 
       int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
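For illustration, here is a minimal standalone sketch of the check the new code enforces. The class name and the parsing details are hypothetical, not the actual Lucene parser; only the segmentation/readings count comparison mirrors the hunk above. A mismatched entry now fails fast with a descriptive RuntimeException instead of a message on stdout.

import java.util.Arrays;

// Illustrative sketch only; not the actual Lucene UserDictionary parser.
public class UserDictEntryCheckSketch {
  public static void main(String[] args) {
    // Entry format: <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
    String line = "関西国際空港,関西 国際 空港,カンサイ コクサイ,カスタム名詞"; // 3 tokens, only 2 readings
    String[] values = line.split(",");
    String[] segmentation = values[1].split(" ");
    String[] readings = values[2].split(" ");
    String pos = values[3];
    if (segmentation.length != readings.length) {
      // This is the condition the patched UserDictionary now rejects with an exception.
      throw new RuntimeException("Illegal user dictionary entry " + values[0] +
          " - the number of segmentations (" + segmentation.length + ")" +
          " does not match the number of readings (" + readings.length + ")");
    }
    System.out.println("OK: " + Arrays.toString(segmentation) + " " + pos);
  }
}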
@@ -1,5 +1,5 @@
 #
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
 #
 # Any token with a part-of-speech tag that exactly matches those defined in this
 # file are removed from the token stream.
@@ -417,4 +417,4 @@
 # unknown: unknown part of speech.
 #未知語
 #
-##### End of file
+##### End of file
@@ -6,7 +6,7 @@
 # for frequency lists, etc. that can be useful for making your own set (if desired)
 #
 # Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
 # that comments are not allowed on the same line as stopwords.
 #
 # Also note that stopping is done in a case-insensitive manner. Change your StopFilter
@@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
 * <analyzer>
 *   <tokenizer class="solr.JapaneseTokenizerFactory"
 *     mode=NORMAL
-*     user-dictionary=user.txt
-*     user-dictionary-encoding=UTF-8
+*     userDictionary=user.txt
+*     userDictionaryEncoding=UTF-8
 *   />
 *   <filter class="solr.JapaneseBaseFormFilterFactory"/>
 * </analyzer>
@@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
 public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
   private static final String MODE = "mode";
 
-  private static final String USER_DICT_PATH = "user-dictionary";
+  private static final String USER_DICT_PATH = "userDictionary";
 
-  private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
+  private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
 
   private UserDictionary userDictionary;
   private Mode mode;
@@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenTestCase {
         "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
     JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
     Map<String,String> args = new HashMap<String,String>();
-    args.put("user-dictionary", "userdict.txt");
+    args.put("userDictionary", "userdict.txt");
     factory.init(args);
     factory.inform(new StringMockSolrResourceLoader(userDict));
     TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));
@@ -1,5 +1,5 @@
 #
-# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
 #
 # Any token with a part-of-speech tag that exactly matches those defined in this
 # file are removed from the token stream.
@@ -417,4 +417,4 @@
 # unknown: unknown part of speech.
 #未知語
 #
-##### End of file
+##### End of file
@@ -6,7 +6,7 @@
 # for frequency lists, etc. that can be useful for making your own set (if desired)
 #
 # Note that there is an overlap between these stopwords and the terms stopped when used
-# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
+# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
 # that comments are not allowed on the same line as stopwords.
 #
 # Also note that stopping is done in a case-insensitive manner. Change your StopFilter
@@ -0,0 +1,29 @@
+#
+# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
+#
+# Add entries to this file in order to override the statistical model in terms
+# of segmentation, readings and part-of-speech tags. Notice that entries do
+# not have weights since they are always used when found. This is by-design
+# in order to maximize ease-of-use.
+#
+# Entries are defined using the following CSV format:
+#   <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
+#
+# Notice that a single half-width space separates tokens and readings, and
+# that the number of tokens and readings must match exactly.
+#
+# Also notice that multiple entries with the same <text> is undefined.
+#
+# Whitespace only lines are ignored. Comments are not allowed on entry lines.
+#
+
+# Custom segmentation for kanji compounds
+日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
+関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
+
+# Custom segmentation for compound katakana
+トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
+ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
+
+# Custom reading for former sumo wrestler
+朝青龍,朝青龍,アサショウリュウ,カスタム人名
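As a usage sketch (not part of this commit), the sample dictionary above can be wired into the Solr factory with the attribute names introduced here, mirroring the factory javadoc and the test in this change set. The ResourceLoader is an assumption: it must be able to resolve lang/userdict_ja.txt, for example a SolrResourceLoader pointed at the example Solr home; the class and method names below are illustrative.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.JapaneseTokenizerFactory;
import org.apache.solr.common.ResourceLoader;

// Sketch: configure the tokenizer with the renamed userDictionary/userDictionaryEncoding
// attributes and tokenize a compound that the sample dictionary segments.
public class UserDictWiringSketch {
  static TokenStream tokenize(ResourceLoader loader) throws Exception {
    Map<String,String> args = new HashMap<String,String>();
    args.put("mode", "search");
    args.put("userDictionary", "lang/userdict_ja.txt");  // formerly user-dictionary
    args.put("userDictionaryEncoding", "UTF-8");          // formerly user-dictionary-encoding
    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
    factory.init(args);
    factory.inform(loader); // loads the user dictionary through the resource loader
    // 関西国際空港 should come out as 関西 / 国際 / 空港 per the entry above.
    return factory.create(new StringReader("関西国際空港に行った"));
  }
}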
@@ -709,24 +709,35 @@
     -->
     <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
       <analyzer>
-        <!-- Kuromoji Japanese morphological analyzer/tokenizer.
+        <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
 
-             Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
-             Example:
-               関西国際空港 (Kansai International Airpart) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
-               so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
-               (With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
+             Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
+             is used to segment compounds into its parts and the compound itself is kept as synonym.
 
-             Valid values for mode are:
+             Valid values for attribute mode are:
                 normal: regular segmentation
-                search: segmentation useful for search with extra splitting (default)
+                search: segmentation useful for search with synonyms compounds (default)
               extended: same as search mode, but unigrams unknown words (experimental)
 
-             NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
+             For some applications it might be good to use search mode for indexing and normal mode for
+             queries to reduce recall and prevent parts of compounds from being matched and highlighted.
+             Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
 
+             Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
+             model with your own entries for segmentation, part-of-speech tags and readings without a need
+             to specify weights. Notice that user dictionaries have not been subject to extensive testing.
+
+             User dictionary attributes are:
+                           userDictionary: user dictionary filename
+                   userDictionaryEncoding: user dictionary encoding (default is UTF-8)
+
+             See lang/userdict_ja.txt for a sample user dictionary file.
+
+             See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
         -->
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
-        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
+        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
         <filter class="solr.JapaneseBaseFormFilterFactory"/>
         <!-- Removes tokens with certain part-of-speech tags -->
         <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
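To make the search-mode behaviour described in the comment concrete, here is a small sketch that is not part of this commit: it prints the tokens JapaneseAnalyzer produces for 関西国際空港. It assumes the 3.6/4.0-era JapaneseAnalyzer(Version) constructor; in search mode the compound is split into 関西, 国際 and 空港 while the whole compound is kept as a synonym, so a query for 空港 matches.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

// Sketch of the decompounding effect described above; assumes the Version-based
// constructor available around this revision.
public class SearchModeDemo {
  public static void main(String[] args) throws Exception {
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Version.LUCENE_40); // search mode is the default
    TokenStream ts = analyzer.tokenStream("body", new StringReader("関西国際空港"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // expect the parts 関西 / 国際 / 空港 plus the compound as a synonym
    }
    ts.end();
    ts.close();
  }
}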