SOLR-2211: add UAX29TokenizerFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1032776 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-11-08 22:40:56 +00:00
parent ccc4b55bec
commit 2e730465b2
3 changed files with 127 additions and 0 deletions

CHANGES.txt

@@ -302,6 +302,9 @@ New Features
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
tokenizer and filters to contrib/analysis-extras (rmuir)
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm
with good results for most languages. (Tom Burton-West via rmuir)
Optimizations
----------------------

UAX29TokenizerFactory.java

@@ -0,0 +1,43 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import java.io.Reader;
import java.util.Map;
/**
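* Factory for {@link UAX29Tokenizer}, which tokenizes text using the word-boundary
* rules of Unicode Text Segmentation (UAX#29).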
* @version $Id$
*
*/
public class UAX29TokenizerFactory extends BaseTokenizerFactory {

  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    // this tokenizer requires luceneMatchVersion to be configured
    assureMatchVersion();
  }

  public UAX29Tokenizer create(Reader input) {
    return new UAX29Tokenizer(input);
  }
}
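As a usage note (not part of this commit): the factory is registered like any other Solr tokenizer factory via the solr.* shorthand in schema.xml. A minimal, illustrative fieldType sketch, where the field type name and the lowercase filter are chosen only for the example:

    <fieldType name="text_uax29" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <!-- UAX#29 word-boundary tokenization provided by this change -->
        <tokenizer class="solr.UAX29TokenizerFactory"/>
        <!-- illustrative downstream filter; any standard filter chain works -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

Because init() calls assureMatchVersion(), luceneMatchVersion must be set (normally in solrconfig.xml) for a core that uses this type.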

TestUAX29TokenizerFactory.java

@@ -0,0 +1,81 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
/**
* A few tests based on org.apache.lucene.analysis.TestUAX29Tokenizer;
*/
public class TestUAX29TokenizerFactory extends BaseTokenTestCase {

  /**
   * Test UAX29TokenizerFactory
   */
  public void testUAX29Tokenizer() throws Exception {
    // "Wha" + combining acute accent (U+0301) + "t's": the combining mark stays inside the token
    Reader reader = new StringReader("Wha\u0301t's this thing do?");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"Wha\u0301t's", "this", "thing", "do"});
  }
  public void testArabic() throws Exception {
    Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
            "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"});
  }
  public void testChinese() throws Exception {
    // each Han character becomes its own token; fullwidth digits and Latin form single tokens
    Reader reader = new StringReader("我是中国人。 １２３４ Ｔｅｓｔｓ ");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"});
  }
  public void testKorean() throws Exception {
    Reader reader = new StringReader("안녕하세요 한글입니다");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"안녕하세요", "한글입니다"});
  }

  public void testHyphen() throws Exception {
    Reader reader = new StringReader("some-dashed-phrase");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"some", "dashed", "phrase"});
  }
}