mirror of https://github.com/apache/lucene.git
SOLR-2210: add factories for icu analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1030012 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f4a564065b
commit
5c6b4f4f65
|
@ -297,6 +297,8 @@ New Features
|
||||||
built-in load balancing, and infrastructure for future SolrCloud work.
|
built-in load balancing, and infrastructure for future SolrCloud work.
|
||||||
(yonik, Mark Miller)
|
(yonik, Mark Miller)
|
||||||
|
|
||||||
|
* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -34,9 +34,6 @@
|
||||||
<property name="clover.db.dir" location="${dest}/tests/clover/db"/>
|
<property name="clover.db.dir" location="${dest}/tests/clover/db"/>
|
||||||
<property name="clover.report.dir" location="${dest}/tests/clover/reports"/>
|
<property name="clover.report.dir" location="${dest}/tests/clover/reports"/>
|
||||||
|
|
||||||
<!-- change this together with the default and test's solrconfig.xml after starting a new development branch: -->
|
|
||||||
<property name="tests.luceneMatchVersion" value="4.0"/>
|
|
||||||
|
|
||||||
<available
|
<available
|
||||||
property="clover.present"
|
property="clover.present"
|
||||||
classname="com.cenqua.clover.tasks.CloverReportTask"
|
classname="com.cenqua.clover.tasks.CloverReportTask"
|
||||||
|
@ -221,6 +218,7 @@
|
||||||
<packageset dir="contrib/dataimporthandler/src/main/java" />
|
<packageset dir="contrib/dataimporthandler/src/main/java" />
|
||||||
<!--<packageset dir="contrib/clustering/src/main/java" />-->
|
<!--<packageset dir="contrib/clustering/src/main/java" />-->
|
||||||
<packageset dir="contrib/extraction/src/main/java" />
|
<packageset dir="contrib/extraction/src/main/java" />
|
||||||
|
<packageset dir="contrib/analysis-extras/src/java" />
|
||||||
<group title="Core" packages="org.apache.*" />
|
<group title="Core" packages="org.apache.*" />
|
||||||
<group title="Common" packages="org.apache.solr.common.*" />
|
<group title="Common" packages="org.apache.solr.common.*" />
|
||||||
<group title="SolrJ" packages="org.apache.solr.client.solrj*" />
|
<group title="SolrJ" packages="org.apache.solr.client.solrj*" />
|
||||||
|
@ -509,6 +507,7 @@
|
||||||
<fileset dir="contrib/dataimporthandler/src/main/java" />
|
<fileset dir="contrib/dataimporthandler/src/main/java" />
|
||||||
<fileset dir="contrib/clustering/src/main/java" />
|
<fileset dir="contrib/clustering/src/main/java" />
|
||||||
<fileset dir="contrib/extraction/src/main/java" />
|
<fileset dir="contrib/extraction/src/main/java" />
|
||||||
|
<fileset dir="contrib/analysis-extras/src/java" />
|
||||||
</clover-setup>
|
</clover-setup>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
@ -609,6 +608,8 @@
|
||||||
basedir="contrib/extraction/src" />
|
basedir="contrib/extraction/src" />
|
||||||
<!--<solr-jar destfile="${dist}/apache-solr-clustering-src-${version}.jar"
|
<!--<solr-jar destfile="${dist}/apache-solr-clustering-src-${version}.jar"
|
||||||
basedir="contrib/clustering/src" />-->
|
basedir="contrib/clustering/src" />-->
|
||||||
|
<solr-jar destfile="${dist}/apache-solr-analysis-extras-src-${version}.jar"
|
||||||
|
basedir="contrib/analysis-extras/src" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
|
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
|
||||||
|
@ -625,6 +626,8 @@
|
||||||
basedir="${build.javadoc}/contrib-solr-clustering" />-->
|
basedir="${build.javadoc}/contrib-solr-clustering" />-->
|
||||||
<solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar"
|
<solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar"
|
||||||
basedir="${build.javadoc}/contrib-solr-cell" />
|
basedir="${build.javadoc}/contrib-solr-cell" />
|
||||||
|
<solr-jar destfile="${dist}/apache-solr-analysis-extras-docs-${version}.jar"
|
||||||
|
basedir="${build.javadoc}/contrib-solr-analysis-extras" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<!-- Creates the solr jar. -->
|
<!-- Creates the solr jar. -->
|
||||||
|
@ -721,7 +724,7 @@
|
||||||
<tarfileset dir="."
|
<tarfileset dir="."
|
||||||
prefix="${fullnamever}"
|
prefix="${fullnamever}"
|
||||||
includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/"
|
includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/"
|
||||||
excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/**" />
|
excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" />
|
||||||
<tarfileset dir="."
|
<tarfileset dir="."
|
||||||
prefix="${fullnamever}"
|
prefix="${fullnamever}"
|
||||||
includes="src/test/test-files/solr/lib/classes/empty-file-main-lib.txt" />
|
includes="src/test/test-files/solr/lib/classes/empty-file-main-lib.txt" />
|
||||||
|
@ -952,6 +955,8 @@
|
||||||
<fileset dir="contrib/clustering/src/test/java"/>
|
<fileset dir="contrib/clustering/src/test/java"/>
|
||||||
<fileset dir="contrib/extraction/src/main/java"/>
|
<fileset dir="contrib/extraction/src/main/java"/>
|
||||||
<fileset dir="contrib/extraction/src/test/java"/>
|
<fileset dir="contrib/extraction/src/test/java"/>
|
||||||
|
<fileset dir="contrib/analysis-extras/src/test"/>
|
||||||
|
<fileset dir="contrib/analysis-extras/src/test"/>
|
||||||
</rat:report>
|
</rat:report>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,9 @@
|
||||||
|
|
||||||
<dirname file="${ant.file.common-solr}" property="common-solr.dir"/>
|
<dirname file="${ant.file.common-solr}" property="common-solr.dir"/>
|
||||||
|
|
||||||
|
<!-- change this together with the default and test's solrconfig.xml after starting a new development branch: -->
|
||||||
|
<property name="tests.luceneMatchVersion" value="4.0"/>
|
||||||
|
|
||||||
<!-- Initialize property values: allow easy customization via build.properties -->
|
<!-- Initialize property values: allow easy customization via build.properties -->
|
||||||
<property file="build.properties" />
|
<property file="build.properties" />
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
The analysis-extras plugin provides additional analyzers that rely
|
||||||
|
upon large dependencies/dictionaries.
|
||||||
|
|
||||||
|
It includes integration with ICU for multilingual support, and
|
||||||
|
analyzers for Chinese and Polish.
|
||||||
|
|
||||||
|
Relies upon the following Lucene components (in lucene-libs/):
|
||||||
|
|
||||||
|
* lucene-analyzers-icu-X.Y.jar
|
||||||
|
* lucene-analyzers-smartcn-X.Y.jar
|
||||||
|
* lucene-analyzers-stempel-X.Y.jar
|
||||||
|
|
||||||
|
And the ICU library (in lib/):
|
||||||
|
|
||||||
|
* icu4j-X.Y.jar
|
||||||
|
|
|
@ -0,0 +1,203 @@
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project name="solr-extraAnalyzers" default="build">
|
||||||
|
|
||||||
|
<property name="solr-path" value="../.."/>
|
||||||
|
|
||||||
|
<import file="../../common-build.xml"/>
|
||||||
|
|
||||||
|
<description>
|
||||||
|
Additional analysis components
|
||||||
|
</description>
|
||||||
|
|
||||||
|
<property name="example.local" value="example"/>
|
||||||
|
|
||||||
|
<!-- support for the additional analyzers modules -->
|
||||||
|
<path id="modules.classpath">
|
||||||
|
<pathelement location="${common-solr.dir}/../modules/analysis/build/icu/classes/java" />
|
||||||
|
<pathelement location="${common-solr.dir}/../modules/analysis/build/smartcn/classes/java" />
|
||||||
|
<pathelement location="${common-solr.dir}/../modules/analysis/build/stempel/classes/java" />
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<target name="prep-module-jars">
|
||||||
|
<subant target="jar" inheritall="false" failonerror="true">
|
||||||
|
<fileset dir="${common-solr.dir}/../modules/analysis/icu" includes="build.xml" />
|
||||||
|
<fileset dir="${common-solr.dir}/../modules/analysis/smartcn" includes="build.xml" />
|
||||||
|
<fileset dir="${common-solr.dir}/../modules/analysis/stempel" includes="build.xml" />
|
||||||
|
</subant>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="module-jars-to-solr" depends="prep-module-jars">
|
||||||
|
<mkdir dir="${lucene-libs}"/>
|
||||||
|
<copy todir="${lucene-libs}" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
|
||||||
|
<fileset dir="${common-solr.dir}/../modules/analysis/build/icu">
|
||||||
|
<include name="lucene-analyzers-icu-${version}.jar" />
|
||||||
|
</fileset>
|
||||||
|
<fileset dir="${common-solr.dir}/../modules/analysis/build/smartcn">
|
||||||
|
<include name="lucene-analyzers-smartcn-${version}.jar" />
|
||||||
|
</fileset>
|
||||||
|
<fileset dir="${common-solr.dir}/../modules/analysis/build/stempel">
|
||||||
|
<include name="lucene-analyzers-stempel-${version}.jar" />
|
||||||
|
</fileset>
|
||||||
|
</copy>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<path id="common.classpath">
|
||||||
|
<fileset dir="lib"/>
|
||||||
|
<pathelement location="${solr-path}/build/solr"/>
|
||||||
|
<pathelement location="${solr-path}/build/solrj"/>
|
||||||
|
<path refid="lucene.classpath"/>
|
||||||
|
<path refid="modules.classpath"/>
|
||||||
|
<fileset dir="${solr-path}/lib" includes="*.jar"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="test.classpath">
|
||||||
|
<pathelement path="${dest}/classes"/>
|
||||||
|
<pathelement path="${dest}/test-classes"/>
|
||||||
|
<pathelement path="${java.class.path}"/>
|
||||||
|
<pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
|
||||||
|
<pathelement location="${common-solr.dir}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
|
||||||
|
<path refid="common.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<target name="clean">
|
||||||
|
<delete failonerror="false" dir="${dest}"/>
|
||||||
|
|
||||||
|
<!-- example doesn't create this anymore, but clean it up
|
||||||
|
if it's still there from an old build
|
||||||
|
-->
|
||||||
|
<delete dir="example/lib" />
|
||||||
|
<delete dir="${lucene-libs}" />
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="init" depends="module-jars-to-solr">
|
||||||
|
<mkdir dir="${dest}/classes"/>
|
||||||
|
|
||||||
|
<mkdir dir="${build.javadoc}"/>
|
||||||
|
<subant target="compileTests">
|
||||||
|
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||||
|
</subant>
|
||||||
|
<subant target="make-manifest">
|
||||||
|
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||||
|
</subant>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="compile" depends="init">
|
||||||
|
<solr-javac destdir="${dest}/classes"
|
||||||
|
classpathref="common.classpath">
|
||||||
|
<src path="src/java"/>
|
||||||
|
</solr-javac>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="build" depends="compile">
|
||||||
|
<solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
|
||||||
|
manifest="../../${dest}/META-INF/MANIFEST.MF"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="compileTests" depends="compile">
|
||||||
|
<solr-javac destdir="${dest}/test-classes"
|
||||||
|
classpathref="test.classpath">
|
||||||
|
<src path="src/test"/>
|
||||||
|
</solr-javac>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="example" depends="build,dist">
|
||||||
|
<!-- this task used to copy libs, but that's no longer needed because
|
||||||
|
../lib and ../lib/downloads are now included explicitly by
|
||||||
|
example/conf/solrconfig.xml
|
||||||
|
-->
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="test" depends="compileTests">
|
||||||
|
<mkdir dir="${junit.output.dir}"/>
|
||||||
|
|
||||||
|
<junit printsummary="no"
|
||||||
|
haltonfailure="no"
|
||||||
|
maxmemory="512M"
|
||||||
|
errorProperty="tests.failed"
|
||||||
|
failureProperty="tests.failed"
|
||||||
|
dir="src/test/test-files/"
|
||||||
|
tempdir="${junit.output.dir}"
|
||||||
|
forkmode="perBatch"
|
||||||
|
>
|
||||||
|
<sysproperty key="java.util.logging.config.file" value="${common-solr.dir}/testlogging.properties"/>
|
||||||
|
<sysproperty key="tests.luceneMatchVersion" value="${tests.luceneMatchVersion}"/>
|
||||||
|
<sysproperty key="tests.codec" value="${tests.codec}"/>
|
||||||
|
<sysproperty key="tests.locale" value="${tests.locale}"/>
|
||||||
|
<sysproperty key="tests.timezone" value="${tests.timezone}"/>
|
||||||
|
<sysproperty key="tests.multiplier" value="${tests.multiplier}"/>
|
||||||
|
<sysproperty key="tests.seed" value="${tests.seed}"/>
|
||||||
|
<sysproperty key="tests.iter" value="${tests.iter}"/>
|
||||||
|
<sysproperty key="jetty.testMode" value="1"/>
|
||||||
|
<sysproperty key="tempDir" file="${junit.output.dir}"/>
|
||||||
|
<sysproperty key="testmethod" value="${testmethod}"/>
|
||||||
|
<jvmarg line="${args}"/>
|
||||||
|
<formatter classname="${junit.details.formatter}" usefile="false" if="junit.details"/>
|
||||||
|
<classpath refid="test.classpath"/>
|
||||||
|
<assertions>
|
||||||
|
<enable package="org.apache.lucene"/>
|
||||||
|
<enable package="org.apache.solr"/>
|
||||||
|
</assertions>
|
||||||
|
<formatter type="${junit.formatter}"/>
|
||||||
|
<batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
|
||||||
|
<fileset dir="src/test" includes="${junit.includes}"/>
|
||||||
|
</batchtest>
|
||||||
|
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
|
||||||
|
<fileset dir="src/test" includes="**/${testcase}.java"/>
|
||||||
|
</batchtest>
|
||||||
|
</junit>
|
||||||
|
|
||||||
|
<fail if="tests.failed">Tests failed!</fail>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="dist" depends="build">
|
||||||
|
<!--
|
||||||
|
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/build/web/WEB-INF/lib"/>
|
||||||
|
<copy todir="${solr-path}/build/web/WEB-INF/lib" flatten="true">
|
||||||
|
<fileset dir="lib">
|
||||||
|
<include name="**/*.jar"/>
|
||||||
|
</fileset>
|
||||||
|
</copy>
|
||||||
|
-->
|
||||||
|
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="javadoc">
|
||||||
|
<sequential>
|
||||||
|
<mkdir dir="${build.javadoc}/contrib-${name}"/>
|
||||||
|
|
||||||
|
<path id="javadoc.classpath">
|
||||||
|
<path refid="common.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<invoke-javadoc
|
||||||
|
destdir="${build.javadoc}/contrib-${name}"
|
||||||
|
title="${Name} ${version} contrib-${fullnamever} API">
|
||||||
|
<sources>
|
||||||
|
<packageset dir="src/java"/>
|
||||||
|
</sources>
|
||||||
|
</invoke-javadoc>
|
||||||
|
</sequential>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[4d9d4e1277822f7a08dd9469ae2ca81d44902552] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,142 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.collation.ICUCollationKeyFilter;
|
||||||
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.Collator;
|
||||||
|
import com.ibm.icu.text.RuleBasedCollator;
|
||||||
|
import com.ibm.icu.util.ULocale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link ICUCollationKeyFilter}.
|
||||||
|
* <p>
|
||||||
|
* This factory can be created in two ways:
|
||||||
|
* <ul>
|
||||||
|
* <li>Based upon a system collator associated with a Locale.
|
||||||
|
* <li>Based upon a tailored ruleset.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* Using a System collator:
|
||||||
|
* <ul>
|
||||||
|
* <li>locale: RFC 3066 locale ID (mandatory)
|
||||||
|
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
|
||||||
|
* <li>decomposition: 'no', or 'canonical' (optional)
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* Using a Tailored ruleset:
|
||||||
|
* <ul>
|
||||||
|
* <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
|
||||||
|
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
|
||||||
|
* <li>decomposition: 'no' or 'canonical' (optional)
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* @see Collator
|
||||||
|
* @see ULocale
|
||||||
|
* @see RuleBasedCollator
|
||||||
|
*/
|
||||||
|
public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||||
|
private Collator collator;
|
||||||
|
|
||||||
|
public void inform(ResourceLoader loader) {
|
||||||
|
String custom = args.get("custom");
|
||||||
|
String localeID = args.get("locale");
|
||||||
|
String strength = args.get("strength");
|
||||||
|
String decomposition = args.get("decomposition");
|
||||||
|
|
||||||
|
if (custom == null && localeID == null)
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
|
||||||
|
|
||||||
|
if (custom != null && localeID != null)
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
|
||||||
|
+ "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
|
||||||
|
+ "Then save the entire customized ruleset to a file, and use with the custom parameter");
|
||||||
|
|
||||||
|
if (localeID != null) {
|
||||||
|
// create from a system collator, based on Locale.
|
||||||
|
collator = createFromLocale(localeID);
|
||||||
|
} else {
|
||||||
|
// create from a custom ruleset
|
||||||
|
collator = createFromRules(custom, loader);
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the strength flag, otherwise it will be the default.
|
||||||
|
if (strength != null) {
|
||||||
|
if (strength.equalsIgnoreCase("primary"))
|
||||||
|
collator.setStrength(Collator.PRIMARY);
|
||||||
|
else if (strength.equalsIgnoreCase("secondary"))
|
||||||
|
collator.setStrength(Collator.SECONDARY);
|
||||||
|
else if (strength.equalsIgnoreCase("tertiary"))
|
||||||
|
collator.setStrength(Collator.TERTIARY);
|
||||||
|
else if (strength.equalsIgnoreCase("quaternary"))
|
||||||
|
collator.setStrength(Collator.QUATERNARY);
|
||||||
|
else if (strength.equalsIgnoreCase("identical"))
|
||||||
|
collator.setStrength(Collator.IDENTICAL);
|
||||||
|
else
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the decomposition flag, otherwise it will be the default.
|
||||||
|
if (decomposition != null) {
|
||||||
|
if (decomposition.equalsIgnoreCase("no"))
|
||||||
|
collator.setDecomposition(Collator.NO_DECOMPOSITION);
|
||||||
|
else if (decomposition.equalsIgnoreCase("canonical"))
|
||||||
|
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
|
||||||
|
else
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new ICUCollationKeyFilter(input, collator);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a locale from localeID.
|
||||||
|
* Then return the appropriate collator for the locale.
|
||||||
|
*/
|
||||||
|
private Collator createFromLocale(String localeID) {
|
||||||
|
return Collator.getInstance(new ULocale(localeID));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Read custom rules from a file, and create a RuleBasedCollator
|
||||||
|
* The file cannot support comments, as # might be in the rules!
|
||||||
|
*/
|
||||||
|
private Collator createFromRules(String fileName, ResourceLoader loader) {
|
||||||
|
InputStream input = null;
|
||||||
|
try {
|
||||||
|
input = loader.openResource(fileName);
|
||||||
|
String rules = IOUtils.toString(input, "UTF-8");
|
||||||
|
return new RuleBasedCollator(rules);
|
||||||
|
} catch (Exception e) {
|
||||||
|
// io error or invalid rules
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
} finally {
|
||||||
|
IOUtils.closeQuietly(input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Factory for {@link ICUFoldingFilter} */
|
||||||
|
public class ICUFoldingFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new ICUFoldingFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,81 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.FilteredNormalizer2;
|
||||||
|
import com.ibm.icu.text.Normalizer2;
|
||||||
|
import com.ibm.icu.text.UnicodeSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link ICUNormalizer2Filter}
|
||||||
|
* <p>
|
||||||
|
* Supports the following attributes:
|
||||||
|
* <ul>
|
||||||
|
* <li>name: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
|
||||||
|
* one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf.
|
||||||
|
* <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc
|
||||||
|
* or nfkc, to get nfd or nfkd, respectively.
|
||||||
|
* <li>filter: A {@link UnicodeSet} pattern. Codepoints outside the set are
|
||||||
|
* always left unchanged. Default is [] (the null set, no filtering).
|
||||||
|
* </ul>
|
||||||
|
* @see ICUNormalizer2Filter
|
||||||
|
* @see Normalizer2
|
||||||
|
* @see FilteredNormalizer2
|
||||||
|
*/
|
||||||
|
public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
|
||||||
|
private Normalizer2 normalizer;
|
||||||
|
|
||||||
|
// TODO: support custom normalization
|
||||||
|
@Override
|
||||||
|
public void init(Map<String,String> args) {
|
||||||
|
super.init(args);
|
||||||
|
String name = args.get("name");
|
||||||
|
if (name == null)
|
||||||
|
name = "nfkc_cf";
|
||||||
|
String mode = args.get("mode");
|
||||||
|
if (mode == null)
|
||||||
|
mode = "compose";
|
||||||
|
|
||||||
|
if (mode.equals("compose"))
|
||||||
|
normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE);
|
||||||
|
else if (mode.equals("decompose"))
|
||||||
|
normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE);
|
||||||
|
else
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode);
|
||||||
|
|
||||||
|
String filter = args.get("filter");
|
||||||
|
if (filter != null) {
|
||||||
|
UnicodeSet set = new UnicodeSet(filter);
|
||||||
|
if (!set.isEmpty()) {
|
||||||
|
set.freeze();
|
||||||
|
normalizer = new FilteredNormalizer2(normalizer, set);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new ICUNormalizer2Filter(input, normalizer);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
|
||||||
|
|
||||||
|
/** Factory for {@link ICUTokenizer} */
|
||||||
|
public class ICUTokenizerFactory extends BaseTokenizerFactory {
|
||||||
|
// TODO: add support for custom configs
|
||||||
|
@Override
|
||||||
|
public Tokenizer create(Reader input) {
|
||||||
|
return new ICUTokenizer(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,67 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.icu.ICUTransformFilter;
|
||||||
|
import org.apache.solr.analysis.BaseTokenFilterFactory;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link ICUTransformFilter}.
|
||||||
|
* <p>
|
||||||
|
* Supports the following attributes:
|
||||||
|
* <ul>
|
||||||
|
* <li>id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()}
|
||||||
|
* <li>direction (optional): Either 'forward' or 'reverse'. Default is forward.
|
||||||
|
* </ul>
|
||||||
|
* @see Transliterator
|
||||||
|
*/
|
||||||
|
public class ICUTransformFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
private Transliterator transliterator;
|
||||||
|
|
||||||
|
// TODO: add support for custom rules
|
||||||
|
@Override
|
||||||
|
public void init(Map<String,String> args) {
|
||||||
|
super.init(args);
|
||||||
|
String id = args.get("id");
|
||||||
|
if (id == null) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "id is required.");
|
||||||
|
}
|
||||||
|
|
||||||
|
int dir;
|
||||||
|
String direction = args.get("direction");
|
||||||
|
if (direction == null || direction.equalsIgnoreCase("forward"))
|
||||||
|
dir = Transliterator.FORWARD;
|
||||||
|
else if (direction.equalsIgnoreCase("reverse"))
|
||||||
|
dir = Transliterator.REVERSE;
|
||||||
|
else
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction);
|
||||||
|
|
||||||
|
transliterator = Transliterator.getInstance(id, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new ICUTransformFilter(input, transliterator);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,170 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.Collator;
|
||||||
|
import com.ibm.icu.text.RuleBasedCollator;
|
||||||
|
import com.ibm.icu.util.ULocale;
|
||||||
|
|
||||||
|
public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Turkish has some funny casing.
|
||||||
|
* This test shows how you can solve this kind of thing easily with collation.
|
||||||
|
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
|
||||||
|
* Then things will sort and match correctly.
|
||||||
|
*/
|
||||||
|
public void testBasicUsage() throws IOException {
|
||||||
|
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
|
||||||
|
String turkishLowerCase = "ı will use turkish casıng";
|
||||||
|
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("locale", "tr");
|
||||||
|
args.put("strength", "primary");
|
||||||
|
factory.init(args);
|
||||||
|
factory.inform(new StringMockSolrResourceLoader(""));
|
||||||
|
TokenStream tsUpper = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(turkishUpperCase)));
|
||||||
|
TokenStream tsLower = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(turkishLowerCase)));
|
||||||
|
assertCollatesToSame(tsUpper, tsLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test usage of the decomposition option for unicode normalization.
|
||||||
|
*/
|
||||||
|
public void testNormalization() throws IOException {
|
||||||
|
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
|
||||||
|
String turkishLowerCase = "ı will use turkish casıng";
|
||||||
|
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("locale", "tr");
|
||||||
|
args.put("strength", "primary");
|
||||||
|
args.put("decomposition", "canonical");
|
||||||
|
factory.init(args);
|
||||||
|
factory.inform(new StringMockSolrResourceLoader(""));
|
||||||
|
TokenStream tsUpper = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(turkishUpperCase)));
|
||||||
|
TokenStream tsLower = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(turkishLowerCase)));
|
||||||
|
assertCollatesToSame(tsUpper, tsLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test secondary strength, for english case is not significant.
|
||||||
|
*/
|
||||||
|
public void testSecondaryStrength() throws IOException {
|
||||||
|
String upperCase = "TESTING";
|
||||||
|
String lowerCase = "testing";
|
||||||
|
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("locale", "en");
|
||||||
|
args.put("strength", "secondary");
|
||||||
|
args.put("decomposition", "no");
|
||||||
|
factory.init(args);
|
||||||
|
factory.inform(new StringMockSolrResourceLoader(""));
|
||||||
|
TokenStream tsUpper = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(upperCase)));
|
||||||
|
TokenStream tsLower = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(lowerCase)));
|
||||||
|
assertCollatesToSame(tsUpper, tsLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For german, you might want oe to sort and match with o umlaut.
|
||||||
|
* This is not the default, but you can make a customized ruleset to do this.
|
||||||
|
*
|
||||||
|
* The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
|
||||||
|
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
|
||||||
|
*/
|
||||||
|
public void testCustomRules() throws Exception {
|
||||||
|
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
|
||||||
|
|
||||||
|
String DIN5007_2_tailorings =
|
||||||
|
"& ae , a\u0308 & AE , A\u0308"+
|
||||||
|
"& oe , o\u0308 & OE , O\u0308"+
|
||||||
|
"& ue , u\u0308 & UE , u\u0308";
|
||||||
|
|
||||||
|
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
|
||||||
|
String tailoredRules = tailoredCollator.getRules();
|
||||||
|
//
|
||||||
|
// at this point, you would save these tailoredRules to a file,
|
||||||
|
// and use the custom parameter.
|
||||||
|
//
|
||||||
|
String germanUmlaut = "Töne";
|
||||||
|
String germanOE = "Toene";
|
||||||
|
ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("custom", "rules.txt");
|
||||||
|
args.put("strength", "primary");
|
||||||
|
factory.init(args);
|
||||||
|
factory.inform(new StringMockSolrResourceLoader(tailoredRules));
|
||||||
|
TokenStream tsUmlaut = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(germanUmlaut)));
|
||||||
|
TokenStream tsOE = factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader(germanOE)));
|
||||||
|
|
||||||
|
assertCollatesToSame(tsUmlaut, tsOE);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class StringMockSolrResourceLoader implements ResourceLoader {
|
||||||
|
String text;
|
||||||
|
|
||||||
|
StringMockSolrResourceLoader(String text) {
|
||||||
|
this.text = text;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getLines(String resource) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object newInstance(String cname, String... subpackages) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public InputStream openResource(String resource) throws IOException {
|
||||||
|
return new ByteArrayInputStream(text.getBytes("UTF-8"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
|
||||||
|
throws IOException {
|
||||||
|
CharTermAttribute term1 = stream1
|
||||||
|
.addAttribute(CharTermAttribute.class);
|
||||||
|
CharTermAttribute term2 = stream2
|
||||||
|
.addAttribute(CharTermAttribute.class);
|
||||||
|
assertTrue(stream1.incrementToken());
|
||||||
|
assertTrue(stream2.incrementToken());
|
||||||
|
assertEquals(term1.toString(), term2.toString());
|
||||||
|
assertFalse(stream1.incrementToken());
|
||||||
|
assertFalse(stream2.incrementToken());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/** basic tests for {@link ICUFoldingFilterFactory} */
|
||||||
|
public class TestICUFoldingFilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
|
/** basic tests to ensure the folding is working */
|
||||||
|
public void test() throws Exception {
|
||||||
|
Reader reader = new StringReader("Résumé");
|
||||||
|
ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
TokenStream stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "resume" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/** basic tests for {@link ICUNormalizer2FilterFactory} */
|
||||||
|
public class TestICUNormalizer2FilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
|
/** Test nfkc_cf defaults */
|
||||||
|
public void testDefaults() throws Exception {
|
||||||
|
Reader reader = new StringReader("This is a Test");
|
||||||
|
ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
TokenStream stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: add tests for different forms
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
/** basic tests for {@link ICUTokenizerFactory} **/
|
||||||
|
public class TestICUTokenizerFactory extends BaseTokenTestCase {
|
||||||
|
public void testMixedText() throws Exception {
|
||||||
|
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
|
||||||
|
ICUTokenizerFactory factory = new ICUTokenizerFactory();
|
||||||
|
TokenStream stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
|
||||||
|
"This", "is", "a", "test", "ກວ່າ", "ດອກ"});
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/** basic tests for {@link ICUTransformFilterFactory} */
|
||||||
|
public class TestICUTransformFilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
|
/** ensure the transform is working */
|
||||||
|
public void test() throws Exception {
|
||||||
|
Reader reader = new StringReader("簡化字");
|
||||||
|
ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("id", "Traditional-Simplified");
|
||||||
|
factory.init(args);
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
TokenStream stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "简化字" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test forward and reverse direction */
|
||||||
|
public void testDirection() throws Exception {
|
||||||
|
// forward
|
||||||
|
Reader reader = new StringReader("Российская Федерация");
|
||||||
|
ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("id", "Cyrillic-Latin");
|
||||||
|
factory.init(args);
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
TokenStream stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" });
|
||||||
|
|
||||||
|
// backward (invokes Latin-Cyrillic)
|
||||||
|
reader = new StringReader("Rossijskaâ Federaciâ");
|
||||||
|
args.put("direction", "reverse");
|
||||||
|
factory.init(args);
|
||||||
|
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue