diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 62272aaa2f9..a9d9e9bf7f4 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -78,6 +78,10 @@ Bug Fixes Other Changes +* SOLR-3524: Make discarding punctuation configurable in JapaneseTokenizerFactory. + The default is to discard punctuation, but this is overridable as an expert option. + (Kazuaki Hiraga, Jun Ohtani via Christian Moen) + * SOLR-1770: Move the default core instance directory into a collection1 folder. (Mark Miller) diff --git a/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java b/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java index bbc66d96423..769d30490f7 100644 --- a/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java @@ -42,9 +42,10 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware; * <fieldType name="text_ja" class="solr.TextField"> * <analyzer> * <tokenizer class="solr.JapaneseTokenizerFactory" - * mode=NORMAL - * userDictionary=user.txt - * userDictionaryEncoding=UTF-8 + * mode="NORMAL" + * userDictionary="user.txt" + * userDictionaryEncoding="UTF-8" + * discardPunctuation="true" * /> * <filter class="solr.JapaneseBaseFormFilterFactory"/> * </analyzer> @@ -58,9 +59,14 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; + private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option + private UserDictionary userDictionary; + private Mode mode; - + + private boolean discardPunctuation; + @Override public void inform(ResourceLoader loader) { mode = getMode(args); @@ -83,11 +89,12 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour } catch (Exception e) { throw new InitializationException("Exception thrown while loading dictionary", e); } + discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true); } @Override public Tokenizer create(Reader input) { - return new JapaneseTokenizer(input, userDictionary, true, mode); + return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode); } private Mode getMode(Map args) { diff --git a/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java index c4bb8113b37..ae6b40b04ea 100644 --- a/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java @@ -74,7 +74,10 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase { new String[] { "シニアソフトウェアエンジニア" } ); } - + + /** + * Test user dictionary + */ public void testUserDict() throws IOException { String userDict = "# Custom segmentation for long entries\n" + @@ -92,4 +95,25 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase { new String[] { "関西", "国際", "空港", "に", "行っ", "た" } ); } + + /** + * Test preserving punctuation + */ + public void testPreservePunctuation() throws IOException { + JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); + Map args = new HashMap(); + args.put("discardPunctuation", "false"); + factory.init(args); + factory.inform(new SolrResourceLoader(null, null)); + TokenStream ts = factory.create( + new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。") + ); + System.out.println(ts.toString()); + assertTokenStreamContents(ts, + new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、", + "来週", "の", "頭", "日本", "に", "戻り", "ます", "。", + "楽しみ", "に", "し", "て", "い", "ます", "!", + "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"} + ); + } } diff --git a/solr/example/solr/collection1/conf/schema.xml b/solr/example/solr/collection1/conf/schema.xml index 644d769bb94..acb4f1ea4e3 100755 --- a/solr/example/solr/collection1/conf/schema.xml +++ b/solr/example/solr/collection1/conf/schema.xml @@ -923,6 +923,8 @@ See lang/userdict_ja.txt for a sample user dictionary file. + Punctuation characters are discarded by default. Use discardPunctuation="false" to keep them. + See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. -->