SOLR-1336: Add support for Lucene's SmartChineseAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1030073 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-11-02 14:51:38 +00:00
parent aede41709e
commit fcbcce7b79
4 changed files with 130 additions and 0 deletions


@@ -299,6 +299,9 @@ New Features

* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir)

* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
  tokenizer and filters to contrib/analysis-extras (rmuir)

Optimizations
----------------------


@@ -0,0 +1,33 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
/**
* Factory for the SmartChineseAnalyzer {@link SentenceTokenizer}
* @lucene.experimental
*/
public class SmartChineseSentenceTokenizerFactory extends BaseTokenizerFactory {
  public Tokenizer create(Reader input) {
    return new SentenceTokenizer(input);
  }
}
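For orientation (not part of this commit): the factory simply instantiates Lucene's SentenceTokenizer, which splits its input into sentence-sized tokens; word segmentation happens in the WordTokenFilter produced by the next factory. A minimal sketch of direct use:

// Minimal sketch, illustration only (not in this commit):
// SentenceTokenizer emits one token per sentence of the input.
Tokenizer sentences = new SmartChineseSentenceTokenizerFactory()
    .create(new StringReader("我购买了道具和服装。我购买了道具和服装。"));
// yields two sentence tokens; chain the word token filter from the next
// file after it to break each sentence into words.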


@@ -0,0 +1,37 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
/**
* Factory for the SmartChineseAnalyzer {@link WordTokenFilter}
* <p>
* Note: this class currently emits tokens for punctuation, so you should either add
* a WordDelimiterFilter after it to remove them (with concatenation off), or use the
* SmartChinese stoplist with a StopFilterFactory via:
* <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
* @lucene.experimental
*/
public class SmartChineseWordTokenFilterFactory extends BaseTokenFilterFactory {
  public TokenFilter create(TokenStream input) {
    return new WordTokenFilter(input);
  }
}
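To illustrate the cleanup the javadoc recommends, here is a rough sketch in plain Lucene terms. It swaps the schema-configured StopFilterFactory for a direct StopFilter wired to SmartChineseAnalyzer's default stop set (built from the same stopwords.txt resource named above); the exact trunk package locations and signatures are assumptions, not part of this commit:

package org.apache.solr.analysis;

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SmartChineseChainSketch {
  public static void main(String[] args) throws Exception {
    // Sentence tokenizer -> word token filter, as the factories wire them.
    Tokenizer sentences = new SmartChineseSentenceTokenizerFactory()
        .create(new StringReader("我购买了道具和服装。"));
    TokenStream ts = new SmartChineseWordTokenFilterFactory().create(sentences);
    // Drop the punctuation tokens using the analyzer's default stop set,
    // which is loaded from the bundled stopwords.txt resource.
    ts = new StopFilter(Version.LUCENE_CURRENT, ts,
        SmartChineseAnalyzer.getDefaultStopSet());
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // 我 购买 了 道具 和 服装 -- no ","
    }
    ts.end();
    ts.close();
  }
}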


@@ -0,0 +1,57 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Tests for {@link SmartChineseSentenceTokenizerFactory} and
 * {@link SmartChineseWordTokenFilterFactory}
 */
public class TestSmartChineseFactories extends BaseTokenTestCase {
  /** Test showing the behavior with whitespace */
  public void testSimple() throws Exception {
    String sentence = "我购买了道具和服装。";
    WhitespaceTokenizer ws = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(sentence));
    SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
    TokenStream ts = factory.create(ws);
    // TODO: fix smart chinese to not emit punctuation tokens
    // at the moment: you have to clean up with WDF, or use the stoplist, etc
    assertTokenStreamContents(ts,
        new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
  }

  /** Test showing the behavior with the sentence tokenizer */
  public void testTokenizer() throws Exception {
    String sentence = "我购买了道具和服装。我购买了道具和服装。";
    SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory();
    Tokenizer tokenizer = tokenizerFactory.create(new StringReader(sentence));
    SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
    TokenStream ts = factory.create(tokenizer);
    // TODO: fix smart chinese to not emit punctuation tokens
    // at the moment: you have to clean up with WDF, or use the stoplist, etc
    assertTokenStreamContents(ts,
        new String[] { "我", "购买", "了", "道具", "和", "服装", ",",
                       "我", "购买", "了", "道具", "和", "服装", ","
        });
  }
}