SOLR-2210: add factories for icu analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1030012 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-11-02 12:03:18 +00:00
parent f4a564065b
commit 5c6b4f4f65
16 changed files with 936 additions and 4 deletions

View File

@ -297,6 +297,8 @@ New Features
built-in load balancing, and infrastructure for future SolrCloud work. built-in load balancing, and infrastructure for future SolrCloud work.
(yonik, Mark Miller) (yonik, Mark Miller)
* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -34,9 +34,6 @@
<property name="clover.db.dir" location="${dest}/tests/clover/db"/> <property name="clover.db.dir" location="${dest}/tests/clover/db"/>
<property name="clover.report.dir" location="${dest}/tests/clover/reports"/> <property name="clover.report.dir" location="${dest}/tests/clover/reports"/>
<!-- change this together with the default and test's solrconfig.xml after starting a new development branch: -->
<property name="tests.luceneMatchVersion" value="4.0"/>
<available <available
property="clover.present" property="clover.present"
classname="com.cenqua.clover.tasks.CloverReportTask" classname="com.cenqua.clover.tasks.CloverReportTask"
@ -221,6 +218,7 @@
<packageset dir="contrib/dataimporthandler/src/main/java" /> <packageset dir="contrib/dataimporthandler/src/main/java" />
<!--<packageset dir="contrib/clustering/src/main/java" />--> <!--<packageset dir="contrib/clustering/src/main/java" />-->
<packageset dir="contrib/extraction/src/main/java" /> <packageset dir="contrib/extraction/src/main/java" />
<packageset dir="contrib/analysis-extras/src/java" />
<group title="Core" packages="org.apache.*" /> <group title="Core" packages="org.apache.*" />
<group title="Common" packages="org.apache.solr.common.*" /> <group title="Common" packages="org.apache.solr.common.*" />
<group title="SolrJ" packages="org.apache.solr.client.solrj*" /> <group title="SolrJ" packages="org.apache.solr.client.solrj*" />
@ -509,6 +507,7 @@
<fileset dir="contrib/dataimporthandler/src/main/java" /> <fileset dir="contrib/dataimporthandler/src/main/java" />
<fileset dir="contrib/clustering/src/main/java" /> <fileset dir="contrib/clustering/src/main/java" />
<fileset dir="contrib/extraction/src/main/java" /> <fileset dir="contrib/extraction/src/main/java" />
<fileset dir="contrib/analysis-extras/src/java" />
</clover-setup> </clover-setup>
</target> </target>
@ -609,6 +608,8 @@
basedir="contrib/extraction/src" /> basedir="contrib/extraction/src" />
<!--<solr-jar destfile="${dist}/apache-solr-clustering-src-${version}.jar" <!--<solr-jar destfile="${dist}/apache-solr-clustering-src-${version}.jar"
basedir="contrib/clustering/src" />--> basedir="contrib/clustering/src" />-->
<solr-jar destfile="${dist}/apache-solr-analysis-extras-src-${version}.jar"
basedir="contrib/analysis-extras/src" />
</target> </target>
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files" <target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
@ -625,6 +626,8 @@
basedir="${build.javadoc}/contrib-solr-clustering" />--> basedir="${build.javadoc}/contrib-solr-clustering" />-->
<solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar" <solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-cell" /> basedir="${build.javadoc}/contrib-solr-cell" />
<solr-jar destfile="${dist}/apache-solr-analysis-extras-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-analysis-extras" />
</target> </target>
<!-- Creates the solr jar. --> <!-- Creates the solr jar. -->
@ -721,7 +724,7 @@
<tarfileset dir="." <tarfileset dir="."
prefix="${fullnamever}" prefix="${fullnamever}"
includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/" includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/"
excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/**" /> excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" />
<tarfileset dir="." <tarfileset dir="."
prefix="${fullnamever}" prefix="${fullnamever}"
includes="src/test/test-files/solr/lib/classes/empty-file-main-lib.txt" /> includes="src/test/test-files/solr/lib/classes/empty-file-main-lib.txt" />
@ -952,6 +955,8 @@
<fileset dir="contrib/clustering/src/test/java"/> <fileset dir="contrib/clustering/src/test/java"/>
<fileset dir="contrib/extraction/src/main/java"/> <fileset dir="contrib/extraction/src/main/java"/>
<fileset dir="contrib/extraction/src/test/java"/> <fileset dir="contrib/extraction/src/test/java"/>
<fileset dir="contrib/analysis-extras/src/java"/>
<fileset dir="contrib/analysis-extras/src/test"/>
</rat:report> </rat:report>
</target> </target>

View File

@ -23,6 +23,9 @@
<dirname file="${ant.file.common-solr}" property="common-solr.dir"/> <dirname file="${ant.file.common-solr}" property="common-solr.dir"/>
<!-- change this together with the default and test's solrconfig.xml after starting a new development branch: -->
<property name="tests.luceneMatchVersion" value="4.0"/>
<!-- Initialize property values: allow easy customization via build.properties --> <!-- Initialize property values: allow easy customization via build.properties -->
<property file="build.properties" /> <property file="build.properties" />

View File

@ -0,0 +1,16 @@
The analysis-extras plugin provides additional analyzers that rely
upon large dependencies/dictionaries.
It includes integration with ICU for multilingual support, and
analyzers for Chinese and Polish.
Relies upon the following lucene components (in lucene-libs/):
* lucene-analyzers-icu-X.Y.jar
* lucene-analyzers-smartcn-X.Y.jar
* lucene-analyzers-stempel-X.Y.jar
And the ICU library (in lib/):
* icu4j-X.Y.jar

View File

@ -0,0 +1,203 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Build file for the analysis-extras contrib: ICU/smartcn/stempel analyzers for Solr. -->
<project name="solr-extraAnalyzers" default="build">
<property name="solr-path" value="../.."/>
<import file="../../common-build.xml"/>
<description>
Additional analysis components
</description>
<property name="example.local" value="example"/>
<!-- support for the additional analyzers modules -->
<path id="modules.classpath">
<pathelement location="${common-solr.dir}/../modules/analysis/build/icu/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/smartcn/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/stempel/classes/java" />
</path>
<!-- builds the icu/smartcn/stempel jars in the sibling lucene modules tree -->
<target name="prep-module-jars">
<subant target="jar" inheritall="false" failonerror="true">
<fileset dir="${common-solr.dir}/../modules/analysis/icu" includes="build.xml" />
<fileset dir="${common-solr.dir}/../modules/analysis/smartcn" includes="build.xml" />
<fileset dir="${common-solr.dir}/../modules/analysis/stempel" includes="build.xml" />
</subant>
</target>
<!-- copies the freshly-built module analyzer jars into ${lucene-libs} for this contrib -->
<target name="module-jars-to-solr" depends="prep-module-jars">
<mkdir dir="${lucene-libs}"/>
<copy todir="${lucene-libs}" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
<fileset dir="${common-solr.dir}/../modules/analysis/build/icu">
<include name="lucene-analyzers-icu-${version}.jar" />
</fileset>
<fileset dir="${common-solr.dir}/../modules/analysis/build/smartcn">
<include name="lucene-analyzers-smartcn-${version}.jar" />
</fileset>
<fileset dir="${common-solr.dir}/../modules/analysis/build/stempel">
<include name="lucene-analyzers-stempel-${version}.jar" />
</fileset>
</copy>
</target>
<!-- compile classpath: contrib lib/, solr + solrj classes, lucene, and the analyzer modules -->
<path id="common.classpath">
<fileset dir="lib"/>
<pathelement location="${solr-path}/build/solr"/>
<pathelement location="${solr-path}/build/solrj"/>
<path refid="lucene.classpath"/>
<path refid="modules.classpath"/>
<fileset dir="${solr-path}/lib" includes="*.jar"/>
</path>
<!-- test classpath: compiled classes/test-classes plus solr and lucene test code -->
<path id="test.classpath">
<pathelement path="${dest}/classes"/>
<pathelement path="${dest}/test-classes"/>
<pathelement path="${java.class.path}"/>
<pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
<pathelement location="${common-solr.dir}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
<path refid="common.classpath"/>
</path>
<target name="clean">
<delete failonerror="false" dir="${dest}"/>
<!-- example doesn't create this anymore, but clean it up
if it's still there from an old build
-->
<delete dir="example/lib" />
<delete dir="${lucene-libs}" />
</target>
<!-- creates output dirs and makes sure solr's test code and manifest exist first -->
<target name="init" depends="module-jars-to-solr">
<mkdir dir="${dest}/classes"/>
<mkdir dir="${build.javadoc}"/>
<subant target="compileTests">
<fileset dir="${solr-path}" includes="build.xml"/>
</subant>
<subant target="make-manifest">
<fileset dir="${solr-path}" includes="build.xml"/>
</subant>
</target>
<target name="compile" depends="init">
<solr-javac destdir="${dest}/classes"
classpathref="common.classpath">
<src path="src/java"/>
</solr-javac>
</target>
<target name="build" depends="compile">
<solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
manifest="../../${dest}/META-INF/MANIFEST.MF"/>
</target>
<target name="compileTests" depends="compile">
<solr-javac destdir="${dest}/test-classes"
classpathref="test.classpath">
<src path="src/test"/>
</solr-javac>
</target>
<target name="example" depends="build,dist">
<!-- this task use to copy lib's but that's no longer needed because
../lib and ../lib/downloads are now included explicitly by
example/conf/solrconfig.xml
-->
</target>
<!-- runs the contrib's junit tests; fails the build if any test fails -->
<target name="test" depends="compileTests">
<mkdir dir="${junit.output.dir}"/>
<junit printsummary="no"
haltonfailure="no"
maxmemory="512M"
errorProperty="tests.failed"
failureProperty="tests.failed"
dir="src/test/test-files/"
tempdir="${junit.output.dir}"
forkmode="perBatch"
>
<sysproperty key="java.util.logging.config.file" value="${common-solr.dir}/testlogging.properties"/>
<sysproperty key="tests.luceneMatchVersion" value="${tests.luceneMatchVersion}"/>
<sysproperty key="tests.codec" value="${tests.codec}"/>
<sysproperty key="tests.locale" value="${tests.locale}"/>
<sysproperty key="tests.timezone" value="${tests.timezone}"/>
<sysproperty key="tests.multiplier" value="${tests.multiplier}"/>
<sysproperty key="tests.seed" value="${tests.seed}"/>
<sysproperty key="tests.iter" value="${tests.iter}"/>
<sysproperty key="jetty.testMode" value="1"/>
<sysproperty key="tempDir" file="${junit.output.dir}"/>
<sysproperty key="testmethod" value="${testmethod}"/>
<jvmarg line="${args}"/>
<formatter classname="${junit.details.formatter}" usefile="false" if="junit.details"/>
<classpath refid="test.classpath"/>
<assertions>
<enable package="org.apache.lucene"/>
<enable package="org.apache.solr"/>
</assertions>
<formatter type="${junit.formatter}"/>
<batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
<fileset dir="src/test" includes="${junit.includes}"/>
</batchtest>
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
<fileset dir="src/test" includes="**/${testcase}.java"/>
</batchtest>
</junit>
<fail if="tests.failed">Tests failed!</fail>
</target>
<target name="dist" depends="build">
<!--
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/build/web/WEB-INF/lib"/>
<copy todir="${solr-path}/build/web/WEB-INF/lib" flatten="true">
<fileset dir="lib">
<include name="**/*.jar"/>
</fileset>
</copy>
-->
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
</target>
<target name="javadoc">
<sequential>
<mkdir dir="${build.javadoc}/contrib-${name}"/>
<path id="javadoc.classpath">
<path refid="common.classpath"/>
</path>
<invoke-javadoc
destdir="${build.javadoc}/contrib-${name}"
title="${Name} ${version} contrib-${fullnamever} API">
<sources>
<packageset dir="src/java"/>
</sources>
</invoke-javadoc>
</sequential>
</target>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[4d9d4e1277822f7a08dd9469ae2ca81d44902552] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,142 @@
package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.InputStream;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.collation.ICUCollationKeyFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.util.plugin.ResourceLoaderAware;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;

/**
 * Factory for {@link ICUCollationKeyFilter}.
 * <p>
 * This factory can be created in two ways:
 * <ul>
 *  <li>Based upon a system collator associated with a Locale.
 *  <li>Based upon a tailored ruleset.
 * </ul>
 * <p>
 * Using a System collator:
 * <ul>
 *  <li>locale: RFC 3066 locale ID (mandatory)
 *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
 *  <li>decomposition: 'no', or 'canonical' (optional)
 * </ul>
 * <p>
 * Using a Tailored ruleset:
 * <ul>
 *  <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
 *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
 *  <li>decomposition: 'no' or 'canonical' (optional)
 * </ul>
 *
 * @see Collator
 * @see ULocale
 * @see RuleBasedCollator
 */
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  /** Collator built from the factory args; configured once in {@link #inform}. */
  private Collator collator;

  public void inform(ResourceLoader loader) {
    String custom = args.get("custom");
    String localeID = args.get("locale");
    String strength = args.get("strength");
    String decomposition = args.get("decomposition");

    // exactly one of 'custom' or 'locale' must be supplied
    if (custom == null && localeID == null)
      throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");

    if (custom != null && localeID != null)
      throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
          + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
          + "Then save the entire customized ruleset to a file, and use with the custom parameter");

    if (localeID != null) {
      // create from a system collator, based on Locale.
      collator = createFromLocale(localeID);
    } else {
      // create from a custom ruleset
      collator = createFromRules(custom, loader);
    }

    // set the strength flag, otherwise it will be the default.
    if (strength != null) {
      applyStrength(strength);
    }
    // set the decomposition flag, otherwise it will be the default.
    if (decomposition != null) {
      applyDecomposition(decomposition);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new ICUCollationKeyFilter(input, collator);
  }

  /** Applies the named strength setting to the collator, or throws for an unknown value. */
  private void applyStrength(String strength) {
    if (strength.equalsIgnoreCase("primary"))
      collator.setStrength(Collator.PRIMARY);
    else if (strength.equalsIgnoreCase("secondary"))
      collator.setStrength(Collator.SECONDARY);
    else if (strength.equalsIgnoreCase("tertiary"))
      collator.setStrength(Collator.TERTIARY);
    else if (strength.equalsIgnoreCase("quaternary"))
      collator.setStrength(Collator.QUATERNARY);
    else if (strength.equalsIgnoreCase("identical"))
      collator.setStrength(Collator.IDENTICAL);
    else
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
  }

  /** Applies the named decomposition setting to the collator, or throws for an unknown value. */
  private void applyDecomposition(String decomposition) {
    if (decomposition.equalsIgnoreCase("no"))
      collator.setDecomposition(Collator.NO_DECOMPOSITION);
    else if (decomposition.equalsIgnoreCase("canonical"))
      collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
    else
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
  }

  /*
   * Create a locale from localeID.
   * Then return the appropriate collator for the locale.
   */
  private Collator createFromLocale(String localeID) {
    return Collator.getInstance(new ULocale(localeID));
  }

  /*
   * Read custom rules from a file, and create a RuleBasedCollator
   * The file cannot support comments, as # might be in the rules!
   */
  private Collator createFromRules(String fileName, ResourceLoader loader) {
    InputStream input = null;
    try {
      input = loader.openResource(fileName);
      String rules = IOUtils.toString(input, "UTF-8");
      return new RuleBasedCollator(rules);
    } catch (Exception e) {
      // io error or invalid rules
      throw new RuntimeException(e);
    } finally {
      IOUtils.closeQuietly(input);
    }
  }
}

View File

@ -0,0 +1,30 @@
package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;

/**
 * Factory for {@link ICUFoldingFilter}.
 * <p>
 * Takes no arguments; simply wraps the incoming stream in an {@link ICUFoldingFilter}.
 */
public class ICUFoldingFilterFactory extends BaseTokenFilterFactory {
  @Override
  public TokenStream create(TokenStream input) {
    return new ICUFoldingFilter(input);
  }
}

View File

@ -0,0 +1,81 @@
package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;

import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

/**
 * Factory for {@link ICUNormalizer2Filter}
 * <p>
 * Supports the following attributes:
 * <ul>
 *  <li>name: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
 *      one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf.
 *  <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc
 *      or nfkc, to get nfd or nfkd, respectively.
 *  <li>filter: A {@link UnicodeSet} pattern. Codepoints outside the set are
 *      always left unchanged. Default is [] (the null set, no filtering).
 * </ul>
 * @see ICUNormalizer2Filter
 * @see Normalizer2
 * @see FilteredNormalizer2
 */
public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
  /** Normalizer configured from the init args; possibly wrapped in a FilteredNormalizer2. */
  private Normalizer2 normalizer;

  // TODO: support custom normalization
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    String name = args.get("name");
    if (name == null)
      name = "nfkc_cf";
    String mode = args.get("mode");
    if (mode == null)
      mode = "compose";

    // mode is matched case-insensitively, consistent with how the sibling ICU
    // factories treat their 'strength' and 'direction' parameters.
    if (mode.equalsIgnoreCase("compose"))
      normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE);
    else if (mode.equalsIgnoreCase("decompose"))
      normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE);
    else
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode);

    String filter = args.get("filter");
    if (filter != null) {
      UnicodeSet set = new UnicodeSet(filter);
      if (!set.isEmpty()) {
        // freeze for thread-safe reuse; an empty set means "no filtering", so skip the wrapper
        set.freeze();
        normalizer = new FilteredNormalizer2(normalizer, set);
      }
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new ICUNormalizer2Filter(input, normalizer);
  }
}

View File

@ -0,0 +1,32 @@
package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;

/**
 * Factory for {@link ICUTokenizer}.
 * <p>
 * Currently takes no arguments: always creates the tokenizer with its defaults.
 */
public class ICUTokenizerFactory extends BaseTokenizerFactory {
  // TODO: add support for custom configs
  @Override
  public Tokenizer create(Reader input) {
    return new ICUTokenizer(input);
  }
}

View File

@ -0,0 +1,67 @@
package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;

import com.ibm.icu.text.Transliterator;

/**
 * Factory for {@link ICUTransformFilter}.
 * <p>
 * Supports the following attributes:
 * <ul>
 *  <li>id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()}
 *  <li>direction (optional): Either 'forward' or 'reverse'. Default is forward.
 * </ul>
 * @see Transliterator
 */
public class ICUTransformFilterFactory extends BaseTokenFilterFactory {
  /** Transliterator created from the 'id' and 'direction' init args. */
  private Transliterator transliterator;

  // TODO: add support for custom rules
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    String id = args.get("id");
    if (id == null) {
      throw new SolrException(ErrorCode.SERVER_ERROR, "id is required.");
    }
    // direction defaults to forward when absent; matched case-insensitively
    int dir;
    String direction = args.get("direction");
    if (direction == null || direction.equalsIgnoreCase("forward")) {
      dir = Transliterator.FORWARD;
    } else if (direction.equalsIgnoreCase("reverse")) {
      dir = Transliterator.REVERSE;
    } else {
      throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction);
    }
    transliterator = Transliterator.getInstance(id, dir);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new ICUTransformFilter(input, transliterator);
  }
}

View File

@ -0,0 +1,170 @@
package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.ResourceLoader;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;

/** Tests for {@link ICUCollationKeyFilterFactory}: locale-based and rule-based collation. */
public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {

  /*
   * Turkish has some funny casing.
   * This test shows how you can solve this kind of thing easily with collation.
   * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
   * Then things will sort and match correctly.
   */
  public void testBasicUsage() throws IOException {
    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "tr");
    args.put("strength", "primary");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }

  /*
   * Test usage of the decomposition option for unicode normalization.
   */
  public void testNormalization() throws IOException {
    // upper-case input uses the decomposed form I + COMBINING DOT ABOVE
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "tr");
    args.put("strength", "primary");
    args.put("decomposition", "canonical");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }

  /*
   * Test secondary strength, for english case is not significant.
   */
  public void testSecondaryStrength() throws IOException {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "secondary");
    args.put("decomposition", "no");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }

  /*
   * For german, you might want oe to sort and match with o umlaut.
   * This is not the default, but you can make a customized ruleset to do this.
   *
   * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
   * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
   */
  public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));

    // each pair tailors the lowercase then the uppercase form; the UE rule
    // previously used a lowercase u\u0308 for the uppercase pair, which broke
    // the pattern of the two preceding rules.
    String DIN5007_2_tailorings =
      "& ae , a\u0308 & AE , A\u0308"+
      "& oe , o\u0308 & OE , O\u0308"+
      "& ue , u\u0308 & UE , U\u0308";

    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();
    //
    // at this point, you would save these tailoredRules to a file,
    // and use the custom parameter.
    //
    String germanUmlaut = "Töne";
    String germanOE = "Toene";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("custom", "rules.txt");
    args.put("strength", "primary");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(tailoredRules));
    TokenStream tsUmlaut = factory.create(
        new KeywordTokenizer(new StringReader(germanUmlaut)));
    TokenStream tsOE = factory.create(
        new KeywordTokenizer(new StringReader(germanOE)));
    assertCollatesToSame(tsUmlaut, tsOE);
  }

  /**
   * ResourceLoader that serves a fixed string as any requested resource;
   * static so it carries no hidden reference to the enclosing test instance.
   */
  private static class StringMockSolrResourceLoader implements ResourceLoader {
    String text;

    StringMockSolrResourceLoader(String text) {
      this.text = text;
    }

    public List<String> getLines(String resource) throws IOException {
      return null;
    }

    public Object newInstance(String cname, String... subpackages) {
      return null;
    }

    public InputStream openResource(String resource) throws IOException {
      return new ByteArrayInputStream(text.getBytes("UTF-8"));
    }
  }

  /** Asserts both single-token streams produce the identical collation key. */
  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
      throws IOException {
    CharTermAttribute term1 = stream1
        .addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2
        .addAttribute(CharTermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.toString(), term2.toString());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
  }
}

View File

@ -0,0 +1,39 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/** basic tests for {@link ICUFoldingFilterFactory} */
public class TestICUFoldingFilterFactory extends BaseTokenTestCase {

  /** Accented input should be folded to its unaccented, lowercased form. */
  public void test() throws Exception {
    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Reader input = new StringReader("Résumé");
    Tokenizer tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    assertTokenStreamContents(factory.create(tokens), new String[] { "resume" });
  }
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/** basic tests for {@link ICUNormalizer2FilterFactory} */
public class TestICUNormalizer2FilterFactory extends BaseTokenTestCase {

  /**
   * Test nfkc_cf defaults: compatibility decomposition plus case folding.
   * Fixed: the input literal had lost its final word (it read "This is a "),
   * so the expected "test" token could never be produced. Restored the
   * fullwidth Ｔｅｓｔ, which nfkc_cf maps to plain lowercase "test".
   */
  public void testDefaults() throws Exception {
    Reader reader = new StringReader("This is a Ｔｅｓｔ");
    ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
  }
  // TODO: add tests for different forms
}

View File

@ -0,0 +1,35 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
/** basic tests for {@link ICUTokenizerFactory} */
public class TestICUTokenizerFactory extends BaseTokenTestCase {

  /** Thai, English, and Lao in one input should each be segmented correctly. */
  public void testMixedText() throws Exception {
    ICUTokenizerFactory factory = new ICUTokenizerFactory();
    Reader input = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
    TokenStream tokens = factory.create(input);
    assertTokenStreamContents(tokens,
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
        "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
  }
}

View File

@ -0,0 +1,64 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/** basic tests for {@link ICUTransformFilterFactory} */
public class TestICUTransformFilterFactory extends BaseTokenTestCase {

  /** Traditional Chinese input should come out transformed to Simplified. */
  public void test() throws Exception {
    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
    Map<String,String> params = new HashMap<String,String>();
    params.put("id", "Traditional-Simplified");
    factory.init(params);
    Tokenizer input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("簡化字"));
    assertTokenStreamContents(factory.create(input), new String[] { "简化字" });
  }

  /** The same transform id can be applied forward or in reverse. */
  public void testDirection() throws Exception {
    // forward: Cyrillic-Latin
    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
    Map<String,String> params = new HashMap<String,String>();
    params.put("id", "Cyrillic-Latin");
    factory.init(params);
    Tokenizer input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Российская Федерация"));
    assertTokenStreamContents(factory.create(input), new String[] { "Rossijskaâ", "Federaciâ" });
    // reverse: re-init with direction=reverse (effectively Latin-Cyrillic)
    params.put("direction", "reverse");
    factory.init(params);
    input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Rossijskaâ Federaciâ"));
    assertTokenStreamContents(factory.create(input), new String[] { "Российская", "Федерация" });
  }
}