mirror of https://github.com/apache/lucene.git
Fixed various issues related to config and user dictionaries for Kuromoji (SOLR-3276)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1306476 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0ced8b5020
commit
ec18632428
|
@ -99,8 +99,9 @@ public final class UserDictionary implements Dictionary {
|
|||
String pos = values[3];
|
||||
|
||||
if (segmentation.length != readings.length) {
|
||||
// FIXME: Should probably deal with this differently. Exception?
|
||||
System.out.println("This entry is not properly formatted : " + line);
|
||||
throw new RuntimeException("Illegal user dictionary entry " + values[0] +
|
||||
" - the number of segmentations (" + segmentation.length + ")" +
|
||||
" does not the match number of readings (" + readings.length + ")");
|
||||
}
|
||||
|
||||
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
|
||||
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
|
||||
#
|
||||
# Any token with a part-of-speech tag that exactly matches those defined in this
|
||||
# file is removed from the token stream.
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
# for frequency lists, etc. that can be useful for making your own set (if desired)
|
||||
#
|
||||
# Note that there is an overlap between these stopwords and the terms stopped when used
|
||||
# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
|
||||
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
|
||||
# that comments are not allowed on the same line as stopwords.
|
||||
#
|
||||
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
|
||||
|
|
|
@ -42,8 +42,8 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
|
|||
* <analyzer>
|
||||
* <tokenizer class="solr.JapaneseTokenizerFactory"
|
||||
* mode=NORMAL
|
||||
* user-dictionary=user.txt
|
||||
* user-dictionary-encoding=UTF-8
|
||||
* userDictionary=user.txt
|
||||
* userDictionaryEncoding=UTF-8
|
||||
* />
|
||||
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
|
||||
* </analyzer>
|
||||
|
@ -53,9 +53,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
|
|||
public class JapaneseTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
|
||||
private static final String MODE = "mode";
|
||||
|
||||
private static final String USER_DICT_PATH = "user-dictionary";
|
||||
private static final String USER_DICT_PATH = "userDictionary";
|
||||
|
||||
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
|
||||
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
|
||||
|
||||
private UserDictionary userDictionary;
|
||||
private Mode mode;
|
||||
|
|
|
@ -78,7 +78,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenTestCase {
|
|||
"朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
|
||||
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("user-dictionary", "userdict.txt");
|
||||
args.put("userDictionary", "userdict.txt");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(userDict));
|
||||
TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# This file defines a Japanese stoptag set for KuromojiPartOfSpeechStopFilter.
|
||||
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
|
||||
#
|
||||
# Any token with a part-of-speech tag that exactly matches those defined in this
|
||||
# file is removed from the token stream.
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
# for frequency lists, etc. that can be useful for making your own set (if desired)
|
||||
#
|
||||
# Note that there is an overlap between these stopwords and the terms stopped when used
|
||||
# in combination with the KuromojiPartOfSpeechStopFilter. When editing this file, note
|
||||
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
|
||||
# that comments are not allowed on the same line as stopwords.
|
||||
#
|
||||
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
#
|
||||
# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
|
||||
#
|
||||
# Add entries to this file in order to override the statistical model in terms
|
||||
# of segmentation, readings and part-of-speech tags. Notice that entries do
|
||||
# not have weights since they are always used when found. This is by design
|
||||
# in order to maximize ease-of-use.
|
||||
#
|
||||
# Entries are defined using the following CSV format:
|
||||
# <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
|
||||
#
|
||||
# Notice that a single half-width space separates tokens and readings, and
|
||||
# that the number of tokens and readings must match exactly.
|
||||
#
|
||||
# Also notice that the behavior of multiple entries with the same <text> is undefined.
|
||||
#
|
||||
# Whitespace only lines are ignored. Comments are not allowed on entry lines.
|
||||
#
|
||||
|
||||
# Custom segmentation for kanji compounds
|
||||
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
|
||||
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
|
||||
|
||||
# Custom segmentation for compound katakana
|
||||
トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
|
||||
ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
|
||||
|
||||
# Custom reading for former sumo wrestler
|
||||
朝青龍,朝青龍,アサショウリュウ,カスタム人名
|
|
@ -709,23 +709,34 @@
|
|||
-->
|
||||
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
|
||||
<analyzer>
|
||||
<!-- Kuromoji Japanese morphological analyzer/tokenizer.
|
||||
<!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
|
||||
|
||||
Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
|
||||
Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
|
||||
is used to segment compounds into their parts and the compound itself is kept as a synonym.
|
||||
|
||||
Example:
|
||||
関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
|
||||
so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
|
||||
(With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
|
||||
|
||||
Valid values for mode are:
|
||||
Valid values for attribute mode are:
|
||||
normal: regular segmentation
|
||||
search: segmentation useful for search with extra splitting (default)
|
||||
search: segmentation useful for search, with compounds kept as synonyms (default)
|
||||
extended: same as search mode, but outputs unknown words as unigrams (experimental)
|
||||
|
||||
NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
|
||||
For some applications it might be good to use search mode for indexing and normal mode for
|
||||
queries to reduce recall and prevent parts of compounds from being matched and highlighted.
|
||||
Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
|
||||
|
||||
Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
User dictionary attributes are:
|
||||
userDictionary: user dictionary filename
|
||||
userDictionaryEncoding: user dictionary encoding (default is UTF-8)
|
||||
|
||||
See lang/userdict_ja.txt for a sample user dictionary file.
|
||||
|
||||
See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
|
||||
-->
|
||||
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
|
||||
<!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
|
||||
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
|
||||
<filter class="solr.JapaneseBaseFormFilterFactory"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags -->
|
||||
|
|
Loading…
Reference in New Issue