mirror of https://github.com/apache/lucene.git
Made discarding punctuation configurable in JapaneseTokenizerFactory (SOLR-3524)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1360592 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1b7637d144
commit
8b97cec044
|
@ -78,6 +78,10 @@ Bug Fixes
|
|||
|
||||
Other Changes
|
||||
|
||||
* SOLR-3524: Make discarding punctuation configurable in JapaneseTokenizerFactory.
|
||||
The default is to discard punctuation, but this is overridable as an expert option.
|
||||
(Kazuaki Hiraga, Jun Ohtani via Christian Moen)
|
||||
|
||||
* SOLR-1770: Move the default core instance directory into a collection1 folder.
|
||||
(Mark Miller)
|
||||
|
||||
|
|
|
@ -42,9 +42,10 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
|||
* <fieldType name="text_ja" class="solr.TextField">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.JapaneseTokenizerFactory"
|
||||
* mode=NORMAL
|
||||
* userDictionary=user.txt
|
||||
* userDictionaryEncoding=UTF-8
|
||||
* mode="NORMAL"
|
||||
* userDictionary="user.txt"
|
||||
* userDictionaryEncoding="UTF-8"
|
||||
* discardPunctuation="true"
|
||||
* />
|
||||
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
|
||||
* </analyzer>
|
||||
|
@ -58,9 +59,14 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
|
|||
|
||||
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
|
||||
|
||||
private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option
|
||||
|
||||
private UserDictionary userDictionary;
|
||||
|
||||
private Mode mode;
|
||||
|
||||
|
||||
private boolean discardPunctuation;
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) {
|
||||
mode = getMode(args);
|
||||
|
@ -83,11 +89,12 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
|
|||
} catch (Exception e) {
|
||||
throw new InitializationException("Exception thrown while loading dictionary", e);
|
||||
}
|
||||
discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader input) {
|
||||
return new JapaneseTokenizer(input, userDictionary, true, mode);
|
||||
return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
|
||||
}
|
||||
|
||||
private Mode getMode(Map<String, String> args) {
|
||||
|
|
|
@ -74,7 +74,10 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
|
|||
new String[] { "シニアソフトウェアエンジニア" }
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test user dictionary
|
||||
*/
|
||||
public void testUserDict() throws IOException {
|
||||
String userDict =
|
||||
"# Custom segmentation for long entries\n" +
|
||||
|
@ -92,4 +95,25 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
|
|||
new String[] { "関西", "国際", "空港", "に", "行っ", "た" }
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test preserving punctuation
|
||||
*/
|
||||
public void testPreservePunctuation() throws IOException {
|
||||
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("discardPunctuation", "false");
|
||||
factory.init(args);
|
||||
factory.inform(new SolrResourceLoader(null, null));
|
||||
TokenStream ts = factory.create(
|
||||
new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
|
||||
);
|
||||
System.out.println(ts.toString());
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
|
||||
"来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
|
||||
"楽しみ", "に", "し", "て", "い", "ます", "!",
|
||||
"お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -923,6 +923,8 @@
|
|||
|
||||
See lang/userdict_ja.txt for a sample user dictionary file.
|
||||
|
||||
Punctuation characters are discarded by default. Use discardPunctuation="false" to keep them.
|
||||
|
||||
See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
|
||||
-->
|
||||
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
|
||||
|
|
Loading…
Reference in New Issue