Made discarding punctuation configurable in JapaneseTokenizerFactory (SOLR-3524)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1360592 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christian Moen 2012-07-12 09:19:02 +00:00
parent 1b7637d144
commit 8b97cec044
4 changed files with 43 additions and 6 deletions

View File

@ -78,6 +78,10 @@ Bug Fixes
Other Changes
* SOLR-3524: Make discarding punctuation configurable in JapaneseTokenizerFactory.
The default is to discard punctuation, but this is overridable as an expert option.
(Kazuaki Hiraga, Jun Ohtani via Christian Moen)
* SOLR-1770: Move the default core instance directory into a collection1 folder.
(Mark Miller)

View File

@ -42,9 +42,10 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
* <fieldType name="text_ja" class="solr.TextField">
* <analyzer>
* <tokenizer class="solr.JapaneseTokenizerFactory"
* mode=NORMAL
* userDictionary=user.txt
* userDictionaryEncoding=UTF-8
* mode="NORMAL"
* userDictionary="user.txt"
* userDictionaryEncoding="UTF-8"
* discardPunctuation="true"
* />
* <filter class="solr.JapaneseBaseFormFilterFactory"/>
* </analyzer>
@ -58,9 +59,14 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option
private UserDictionary userDictionary;
private Mode mode;
private boolean discardPunctuation;
@Override
public void inform(ResourceLoader loader) {
mode = getMode(args);
@ -83,11 +89,12 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
} catch (Exception e) {
throw new InitializationException("Exception thrown while loading dictionary", e);
}
discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
}
/**
 * Creates a {@link JapaneseTokenizer} reading from {@code input}, handing it this
 * factory's user dictionary, its punctuation setting and its segmentation mode.
 */
@Override
public Tokenizer create(Reader input) {
// NOTE(review): two return statements appear back to back. The first (hard-coded true)
// looks like the pre-change line of this diff and the second its replacement that uses
// the configurable discardPunctuation field -- confirm against the actual file.
return new JapaneseTokenizer(input, userDictionary, true, mode);
return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
}
private Mode getMode(Map<String, String> args) {

View File

@ -74,7 +74,10 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
new String[] { "シニアソフトウェアエンジニア" }
);
}
/**
* Test user dictionary
*/
public void testUserDict() throws IOException {
String userDict =
"# Custom segmentation for long entries\n" +
@ -92,4 +95,25 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
new String[] { "関西", "国際", "空港", "", "行っ", "" }
);
}
/**
 * Test preserving punctuation.
 * <p>
 * Configures the factory with {@code discardPunctuation="false"} and checks the
 * token stream produced for a sentence containing several punctuation marks.
 * The expected tokens, concatenated in order, reproduce the input string
 * exactly, so each punctuation mark must come back as its own token.
 */
public void testPreservePunctuation() throws IOException {
  JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("discardPunctuation", "false");
  factory.init(args);
  factory.inform(new SolrResourceLoader(null, null));
  TokenStream ts = factory.create(
      new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
  );
  // One row per clause of the input; the last token of each row is the
  // punctuation that ends that clause (the final clause ends with three).
  assertTokenStreamContents(ts,
      new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
          "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
          "楽しみ", "に", "し", "て", "い", "ます", "!",
          "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"}
  );
}
}

View File

@ -923,6 +923,8 @@
See lang/userdict_ja.txt for a sample user dictionary file.
Punctuation characters are discarded by default. Use discardPunctuation="false" to keep them.
See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
-->
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>