SOLR-2211: add UAX29TokenizerFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1032776 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-11-08 22:40:56 +00:00
parent ccc4b55bec
commit 2e730465b2
3 changed files with 127 additions and 0 deletions

CHANGES.txt

@@ -302,6 +302,9 @@ New Features
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
tokenizer and filters to contrib/analysis-extras (rmuir)
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm
with good results for most languages. (Tom Burton-West via rmuir)
Optimizations
----------------------

UAX29TokenizerFactory.java

@@ -0,0 +1,43 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import java.io.Reader;
import java.util.Map;
/**
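* Factory for {@link UAX29Tokenizer}, which tokenizes text using the word-boundary
* rules of Unicode Text Segmentation (UAX#29).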
* @version $Id$
*
*/
public class UAX29TokenizerFactory extends BaseTokenizerFactory {

  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    // this tokenizer requires luceneMatchVersion to be configured
    assureMatchVersion();
  }

  public UAX29Tokenizer create(Reader input) {
    return new UAX29Tokenizer(input);
  }
}
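As a usage note (not part of this commit): the factory is registered like any other Solr tokenizer factory via the solr.* shorthand in schema.xml. A minimal, illustrative fieldType sketch, where the field type name and the lowercase filter are chosen only for the example:

    <fieldType name="text_uax29" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <!-- UAX#29 word-boundary tokenization provided by this change -->
        <tokenizer class="solr.UAX29TokenizerFactory"/>
        <!-- illustrative downstream filter; any standard filter chain works -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

Because init() calls assureMatchVersion(), luceneMatchVersion must be set (normally in solrconfig.xml) for a core that uses this type.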

TestUAX29TokenizerFactory.java

@@ -0,0 +1,81 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
/**
* A few tests based on org.apache.lucene.analysis.TestUAX29Tokenizer;
*/
public class TestUAX29TokenizerFactory extends BaseTokenTestCase {

  /**
   * Test UAX29TokenizerFactory
   */
  public void testUAX29Tokenizer() throws Exception {
    // "Wha" + combining acute accent (U+0301) + "t's": the combining mark stays inside the token
    Reader reader = new StringReader("Wha\u0301t's this thing do?");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"Wha\u0301t's", "this", "thing", "do"});
  }
  public void testArabic() throws Exception {
    Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
            "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"});
  }
  public void testChinese() throws Exception {
    // each Han character becomes its own token; fullwidth digits and Latin form single tokens
    Reader reader = new StringReader("我是中国人。 １２３４ Ｔｅｓｔｓ ");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"});
  }
  public void testKorean() throws Exception {
    Reader reader = new StringReader("안녕하세요 한글입니다");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"안녕하세요", "한글입니다"});
  }

  public void testHyphen() throws Exception {
    Reader reader = new StringReader("some-dashed-phrase");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"some", "dashed", "phrase"});
  }
}