mirror of https://github.com/apache/lucene.git
SOLR-1804: Re-integrated Carrot2
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@988129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ad8bfb3bc4
commit
9b62d49e92
|
@ -234,7 +234,7 @@ New Features
|
||||||
|
|
||||||
* SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing.
|
* SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing.
|
||||||
(Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar,
|
(Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar,
|
||||||
Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
|
Koji Sekiguchi, Iv<EFBFBD>n de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
|
||||||
Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
|
Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
|
||||||
Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger, yonik)
|
Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger, yonik)
|
||||||
|
|
||||||
|
@ -521,6 +521,8 @@ Other Changes
|
||||||
* SOLR-2003: SolrResourceLoader will report any encoding errors, rather than
|
* SOLR-2003: SolrResourceLoader will report any encoding errors, rather than
|
||||||
silently using replacement characters for invalid inputs (blargy via rmuir)
|
silently using replacement characters for invalid inputs (blargy via rmuir)
|
||||||
|
|
||||||
|
* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -778,26 +778,9 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
==========================================================================
|
|
||||||
EHCache
|
|
||||||
/**
|
|
||||||
* Copyright 2003-2008 Luck Consulting Pty Ltd
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
==========================================================================
|
==========================================================================
|
||||||
Google Collections
|
Guava
|
||||||
/**
|
/**
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
|
|
|
@ -148,7 +148,7 @@ Copyright (c) 2000-2005 INRIA, France Telecom
|
||||||
=========================================================================
|
=========================================================================
|
||||||
== Carrot2 Notice ==
|
== Carrot2 Notice ==
|
||||||
=========================================================================
|
=========================================================================
|
||||||
Copyright (C) 2002-2008, Dawid Weiss, Stanislaw Osinski.
|
Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
|
||||||
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
|
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
|
@ -156,24 +156,16 @@ This product includes software developed by the Carrot2 Project.
|
||||||
|
|
||||||
See http://project.carrot2.org/
|
See http://project.carrot2.org/
|
||||||
|
|
||||||
=========================================================================
|
|
||||||
== EHCache Notice ==
|
|
||||||
=========================================================================
|
|
||||||
Copyright 2003-2008 Luck Consulting Pty Ltd
|
|
||||||
|
|
||||||
This product includes software developed by the EHCache Project
|
|
||||||
|
|
||||||
See ????
|
|
||||||
|
|
||||||
=========================================================================
|
=========================================================================
|
||||||
== Google Collections Notice ==
|
== Guava Notice ==
|
||||||
=========================================================================
|
=========================================================================
|
||||||
|
|
||||||
Copyright ???? Google, Inc.
|
Copyright ???? Google, Inc.
|
||||||
|
|
||||||
This product includes software developed by the Google Collections project.
|
This product includes software developed by the Google Guava project.
|
||||||
|
|
||||||
See ????
|
See http://code.google.com/p/guava-libraries/
|
||||||
|
|
||||||
=========================================================================
|
=========================================================================
|
||||||
== Jackson Notice ==
|
== Jackson Notice ==
|
||||||
|
@ -182,7 +174,7 @@ Copyright ????
|
||||||
|
|
||||||
This product includes software developed by the Jackson project.
|
This product includes software developed by the Jackson project.
|
||||||
|
|
||||||
See ????
|
See http://jackson.codehaus.org/
|
||||||
|
|
||||||
=========================================================================
|
=========================================================================
|
||||||
== HSQLDB Notice ==
|
== HSQLDB Notice ==
|
||||||
|
|
|
@ -8,12 +8,15 @@ CHANGES
|
||||||
|
|
||||||
$Id:$
|
$Id:$
|
||||||
|
|
||||||
================== Release 1.5-dev ==================
|
================== Release XXXX ==================
|
||||||
|
|
||||||
* SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
|
* SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
|
||||||
|
|
||||||
* SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers)
|
* SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers)
|
||||||
|
|
||||||
|
* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2. No more LGPL run-time dependencies.
|
||||||
|
This release of C2 also does not have a specific Lucene dependency. (Stanislaw Osinski, gsingers)
|
||||||
|
|
||||||
================== Release 1.4.0 ==================
|
================== Release 1.4.0 ==================
|
||||||
|
|
||||||
Solr Clustering will be released for the first time in Solr 1.4. See http://wiki.apache.org/solr/ClusteringComponent
|
Solr Clustering will be released for the first time in Solr 1.4. See http://wiki.apache.org/solr/ClusteringComponent
|
||||||
|
|
|
@ -0,0 +1,160 @@
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project name="solr-clustering" default="build">
|
||||||
|
|
||||||
|
<property name="solr-path" value="../.."/>
|
||||||
|
|
||||||
|
<import file="../../common-build.xml"/>
|
||||||
|
|
||||||
|
<description>
|
||||||
|
Clustering Integraton
|
||||||
|
</description>
|
||||||
|
|
||||||
|
<property name="example.local" value="example"/>
|
||||||
|
|
||||||
|
<path id="common.classpath">
|
||||||
|
<fileset dir="lib"/>
|
||||||
|
<pathelement location="${solr-path}/build/solr"/>
|
||||||
|
<pathelement location="${solr-path}/build/solrj"/>
|
||||||
|
<path refid="lucene.classpath"/>
|
||||||
|
<fileset dir="${solr-path}/lib" includes="*.jar"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="test.classpath">
|
||||||
|
<pathelement path="${dest}/classes"/>
|
||||||
|
<pathelement path="${dest}/test-classes"/>
|
||||||
|
<pathelement path="${java.class.path}"/>
|
||||||
|
<pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
|
||||||
|
<pathelement location="${common-solr.dir}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
|
||||||
|
<path refid="common.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<target name="clean">
|
||||||
|
<delete failonerror="false" dir="${dest}"/>
|
||||||
|
|
||||||
|
<!-- example doesn't create this anymore, but clean it up
|
||||||
|
if it's still there from an old build
|
||||||
|
-->
|
||||||
|
<delete dir="example/lib" />
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="init">
|
||||||
|
<mkdir dir="${dest}/classes"/>
|
||||||
|
|
||||||
|
<mkdir dir="${build.javadoc}"/>
|
||||||
|
<subant target="compileTests">
|
||||||
|
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||||
|
</subant>
|
||||||
|
<subant target="make-manifest">
|
||||||
|
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||||
|
</subant>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="compile" depends="init">
|
||||||
|
<solr-javac destdir="${dest}/classes"
|
||||||
|
classpathref="common.classpath">
|
||||||
|
<src path="src/main/java"/>
|
||||||
|
</solr-javac>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="build" depends="compile">
|
||||||
|
<solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
|
||||||
|
manifest="../../${dest}/META-INF/MANIFEST.MF"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="compileTests" depends="compile">
|
||||||
|
<solr-javac destdir="${dest}/test-classes"
|
||||||
|
classpathref="test.classpath">
|
||||||
|
<src path="src/test/java"/>
|
||||||
|
</solr-javac>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="example" depends="build,dist">
|
||||||
|
<!-- this task use to copy lib's but that's no longer needed because
|
||||||
|
../lib and ../lib/downloads are now included explicitly by
|
||||||
|
example/conf/solrconfig.xml
|
||||||
|
-->
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="test" depends="compileTests">
|
||||||
|
<mkdir dir="${junit.output.dir}"/>
|
||||||
|
|
||||||
|
<junit printsummary="on"
|
||||||
|
haltonfailure="no"
|
||||||
|
maxmemory="512M"
|
||||||
|
errorProperty="tests.failed"
|
||||||
|
failureProperty="tests.failed"
|
||||||
|
dir="src/test/resources/"
|
||||||
|
tempdir="${junit.output.dir}"
|
||||||
|
>
|
||||||
|
<formatter type="brief" usefile="false" if="junit.details"/>
|
||||||
|
<classpath refid="test.classpath"/>
|
||||||
|
<assertions>
|
||||||
|
<enable package="org.apache.lucene"/>
|
||||||
|
<enable package="org.apache.solr"/>
|
||||||
|
</assertions>
|
||||||
|
<formatter type="xml"/>
|
||||||
|
<batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
|
||||||
|
<fileset dir="src/test/java" includes="${junit.includes}">
|
||||||
|
<exclude name="**/AbstractClusteringTest*"/>
|
||||||
|
</fileset>
|
||||||
|
</batchtest>
|
||||||
|
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
|
||||||
|
<fileset dir="src/test/java" includes="**/${testcase}.java"/>
|
||||||
|
</batchtest>
|
||||||
|
</junit>
|
||||||
|
|
||||||
|
<fail if="tests.failed">Tests failed!</fail>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="dist" depends="build">
|
||||||
|
<!--
|
||||||
|
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/build/web/WEB-INF/lib"/>
|
||||||
|
<copy todir="${solr-path}/build/web/WEB-INF/lib" flatten="true">
|
||||||
|
<fileset dir="lib">
|
||||||
|
<include name="**/*.jar"/>
|
||||||
|
</fileset>
|
||||||
|
</copy>
|
||||||
|
-->
|
||||||
|
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="javadoc">
|
||||||
|
<sequential>
|
||||||
|
<mkdir dir="${build.javadoc}/contrib-${name}"/>
|
||||||
|
|
||||||
|
<path id="javadoc.classpath">
|
||||||
|
<path refid="common.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<invoke-javadoc
|
||||||
|
destdir="${build.javadoc}/contrib-${name}"
|
||||||
|
title="${Name} ${version} contrib-${fullnamever} API">
|
||||||
|
<sources>
|
||||||
|
<packageset dir="src/main/java"/>
|
||||||
|
</sources>
|
||||||
|
</invoke-javadoc>
|
||||||
|
</sequential>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[96c3bdbdaacd5289b0e654842e435689fbcf22e2] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
||||||
AnyObjectId[5ca86c5e72b2953feb0b58fbd87f76d0301cbbf6] was removed in git history.
|
|
||||||
Apache SVN contains full history.
|
|
|
@ -1,2 +0,0 @@
|
||||||
AnyObjectId[85a0ab428be7c8913c120aa932a3d78f705fa73a] was removed in git history.
|
|
||||||
Apache SVN contains full history.
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[05c00b3fbfe234cd33477291432af9d172f13e15] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
||||||
AnyObjectId[f6f425d4a0c127d5249d939b7a93b1250d454cdd] was removed in git history.
|
|
||||||
Apache SVN contains full history.
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[24107e68fedb0ea04291fa769cf992fc53608c60] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
||||||
AnyObjectId[c1652907ebda1e69895d85730f4fc83e1160306e] was removed in git history.
|
|
||||||
Apache SVN contains full history.
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[ebf9b5ef21631c4c76e21ae6fc35b0a9f7336d2c] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[06481add9c7ebce50b75f14d469566906bc0d5a3] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[374e19d07c7f073dc9c84f4dfada8c944389efc0] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[f668bc86b1d42264758b54a0395e49eb564dfd27] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -58,7 +58,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
/**
|
/**
|
||||||
* Carrot2 controller that manages instances of clustering algorithms
|
* Carrot2 controller that manages instances of clustering algorithms
|
||||||
*/
|
*/
|
||||||
private CachingController controller = new CachingController();
|
private Controller controller = ControllerFactory.createPooling();
|
||||||
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
|
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
|
||||||
|
|
||||||
private String idFieldName;
|
private String idFieldName;
|
||||||
|
@ -91,6 +91,12 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
// Initialize Carrot2 controller. Pass initialization attributes, if any.
|
// Initialize Carrot2 controller. Pass initialization attributes, if any.
|
||||||
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
|
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
|
||||||
extractCarrotAttributes(initParams, initAttributes);
|
extractCarrotAttributes(initParams, initAttributes);
|
||||||
|
|
||||||
|
// Customize the language model factory. The implementation we provide here
|
||||||
|
// is included in the code base of Solr, so that it's possible to refactor
|
||||||
|
// the Lucene APIs the factory relies on if needed.
|
||||||
|
initAttributes.put("PreprocessingPipeline.languageModelFactory",
|
||||||
|
new LuceneLanguageModelFactory());
|
||||||
this.controller.init(initAttributes);
|
this.controller.init(initAttributes);
|
||||||
|
|
||||||
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
|
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
|
||||||
|
|
|
@ -0,0 +1,353 @@
|
||||||
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.CharBuffer;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.ar.ArabicNormalizer;
|
||||||
|
import org.apache.lucene.analysis.ar.ArabicStemmer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.carrot2.core.LanguageCode;
|
||||||
|
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
|
||||||
|
import org.carrot2.text.analysis.ITokenizer;
|
||||||
|
import org.carrot2.text.linguistic.BaseLanguageModelFactory;
|
||||||
|
import org.carrot2.text.linguistic.IStemmer;
|
||||||
|
import org.carrot2.text.linguistic.IdentityStemmer;
|
||||||
|
import org.carrot2.text.util.MutableCharArray;
|
||||||
|
import org.carrot2.util.ExceptionUtils;
|
||||||
|
import org.carrot2.util.ReflectionUtils;
|
||||||
|
import org.carrot2.util.attribute.Bindable;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.tartarus.snowball.SnowballProgram;
|
||||||
|
import org.tartarus.snowball.ext.DanishStemmer;
|
||||||
|
import org.tartarus.snowball.ext.DutchStemmer;
|
||||||
|
import org.tartarus.snowball.ext.EnglishStemmer;
|
||||||
|
import org.tartarus.snowball.ext.FinnishStemmer;
|
||||||
|
import org.tartarus.snowball.ext.FrenchStemmer;
|
||||||
|
import org.tartarus.snowball.ext.GermanStemmer;
|
||||||
|
import org.tartarus.snowball.ext.HungarianStemmer;
|
||||||
|
import org.tartarus.snowball.ext.ItalianStemmer;
|
||||||
|
import org.tartarus.snowball.ext.NorwegianStemmer;
|
||||||
|
import org.tartarus.snowball.ext.PortugueseStemmer;
|
||||||
|
import org.tartarus.snowball.ext.RomanianStemmer;
|
||||||
|
import org.tartarus.snowball.ext.RussianStemmer;
|
||||||
|
import org.tartarus.snowball.ext.SpanishStemmer;
|
||||||
|
import org.tartarus.snowball.ext.SwedishStemmer;
|
||||||
|
import org.tartarus.snowball.ext.TurkishStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A Solr-specific language model factory for Carrot2. This factory is the only
|
||||||
|
* element in Carrot2 that depends on Lucene APIs, so should the APIs need to
|
||||||
|
* change, the changes can be made in this class.
|
||||||
|
*/
|
||||||
|
@Bindable(prefix = "DefaultLanguageModelFactory")
|
||||||
|
public class LuceneLanguageModelFactory extends BaseLanguageModelFactory {
|
||||||
|
final static Logger logger = org.slf4j.LoggerFactory
|
||||||
|
.getLogger(LuceneLanguageModelFactory.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide an {@link IStemmer} implementation for a given language.
|
||||||
|
*/
|
||||||
|
protected IStemmer createStemmer(LanguageCode language) {
|
||||||
|
switch (language) {
|
||||||
|
case ARABIC:
|
||||||
|
return ArabicStemmerFactory.createStemmer();
|
||||||
|
|
||||||
|
case CHINESE_SIMPLIFIED:
|
||||||
|
return IdentityStemmer.INSTANCE;
|
||||||
|
|
||||||
|
default:
|
||||||
|
/*
|
||||||
|
* For other languages, try to use snowball's stemming.
|
||||||
|
*/
|
||||||
|
return SnowballStemmerFactory.createStemmer(language);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected ITokenizer createTokenizer(LanguageCode language) {
|
||||||
|
switch (language) {
|
||||||
|
case CHINESE_SIMPLIFIED:
|
||||||
|
return ChineseTokenizerFactory.createTokenizer();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We use our own analyzer for Arabic. Lucene's version has special
|
||||||
|
* support for Nonspacing-Mark characters (see
|
||||||
|
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
|
||||||
|
* have them included as letters in the parser.
|
||||||
|
*/
|
||||||
|
case ARABIC:
|
||||||
|
// Intentional fall-through.
|
||||||
|
|
||||||
|
default:
|
||||||
|
return new ExtendedWhitespaceTokenizer();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory of {@link IStemmer} implementations from the <code>snowball</code>
|
||||||
|
* project.
|
||||||
|
*/
|
||||||
|
private final static class SnowballStemmerFactory {
|
||||||
|
/**
|
||||||
|
* Static hard mapping from language codes to stemmer classes in Snowball.
|
||||||
|
* This mapping is not dynamic because we want to keep the possibility to
|
||||||
|
* obfuscate these classes.
|
||||||
|
*/
|
||||||
|
private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
|
||||||
|
static {
|
||||||
|
snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
|
||||||
|
snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
|
||||||
|
snowballStemmerClasses
|
||||||
|
.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
|
||||||
|
snowballStemmerClasses
|
||||||
|
.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
|
||||||
|
PortugueseStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
|
||||||
|
snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An adapter converting Snowball programs into {@link IStemmer} interface.
|
||||||
|
*/
|
||||||
|
private static class SnowballStemmerAdapter implements IStemmer {
|
||||||
|
private final SnowballProgram snowballStemmer;
|
||||||
|
|
||||||
|
public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
|
||||||
|
this.snowballStemmer = snowballStemmer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CharSequence stem(CharSequence word) {
|
||||||
|
snowballStemmer.setCurrent(word.toString());
|
||||||
|
if (snowballStemmer.stem()) {
|
||||||
|
return snowballStemmer.getCurrent();
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and return an {@link IStemmer} adapter for a
|
||||||
|
* {@link SnowballProgram} for a given language code. An identity stemmer is
|
||||||
|
* returned for unknown languages.
|
||||||
|
*/
|
||||||
|
public static IStemmer createStemmer(LanguageCode language) {
|
||||||
|
final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
|
||||||
|
.get(language);
|
||||||
|
|
||||||
|
if (stemmerClazz == null) {
|
||||||
|
logger.warn("No Snowball stemmer class for: " + language.name()
|
||||||
|
+ ". Quality of clustering may be degraded.");
|
||||||
|
return IdentityStemmer.INSTANCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return new SnowballStemmerAdapter(stemmerClazz.newInstance());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("Could not instantiate snowball stemmer"
|
||||||
|
+ " for language: " + language.name()
|
||||||
|
+ ". Quality of clustering may be degraded.", e);
|
||||||
|
|
||||||
|
return IdentityStemmer.INSTANCE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory of {@link IStemmer} implementations for the
|
||||||
|
* {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
|
||||||
|
* to be present in classpath, otherwise an empty (identity) stemmer is
|
||||||
|
* returned.
|
||||||
|
*/
|
||||||
|
private static class ArabicStemmerFactory {
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
|
||||||
|
ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
|
||||||
|
} catch (ClassNotFoundException e) {
|
||||||
|
logger
|
||||||
|
.warn(
|
||||||
|
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
|
||||||
|
+ "of Chinese content may be degraded. For best quality clusters, "
|
||||||
|
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adapter to lucene-contrib Arabic analyzers.
|
||||||
|
*/
|
||||||
|
private static class LuceneStemmerAdapter implements IStemmer {
|
||||||
|
private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
|
||||||
|
private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
|
||||||
|
|
||||||
|
private char[] buffer = new char[0];
|
||||||
|
|
||||||
|
private LuceneStemmerAdapter() throws Exception {
|
||||||
|
delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
|
||||||
|
normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
|
||||||
|
}
|
||||||
|
|
||||||
|
public CharSequence stem(CharSequence word) {
|
||||||
|
if (word.length() > buffer.length) {
|
||||||
|
buffer = new char[word.length()];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < word.length(); i++) {
|
||||||
|
buffer[i] = word.charAt(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int newLen = normalizer.normalize(buffer, word.length());
|
||||||
|
newLen = delegate.stem(buffer, newLen);
|
||||||
|
|
||||||
|
if (newLen != word.length() || !equals(buffer, newLen, word)) {
|
||||||
|
return CharBuffer.wrap(buffer, 0, newLen);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same-same.
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean equals(char[] buffer, int len, CharSequence word) {
|
||||||
|
assert len == word.length();
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
if (buffer[i] != word.charAt(i))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static IStemmer createStemmer() {
|
||||||
|
try {
|
||||||
|
return new LuceneStemmerAdapter();
|
||||||
|
} catch (Throwable e) {
|
||||||
|
return IdentityStemmer.INSTANCE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
|
||||||
|
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
|
||||||
|
* factory will fall back to the default white space tokenizer.
|
||||||
|
*/
|
||||||
|
private static final class ChineseTokenizerFactory {
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
ReflectionUtils.classForName(
|
||||||
|
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||||
|
ReflectionUtils.classForName(
|
||||||
|
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
logger
|
||||||
|
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
|
||||||
|
+ "of Chinese content may be degraded. For best quality clusters, "
|
||||||
|
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static ITokenizer createTokenizer() {
|
||||||
|
try {
|
||||||
|
return new ChineseTokenizer();
|
||||||
|
} catch (Throwable e) {
|
||||||
|
return new ExtendedWhitespaceTokenizer();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static class ChineseTokenizer implements ITokenizer {
|
||||||
|
private final static Pattern numeric = Pattern
|
||||||
|
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
|
||||||
|
|
||||||
|
private Tokenizer sentenceTokenizer;
|
||||||
|
private TokenStream wordTokenFilter;
|
||||||
|
private CharTermAttribute term = null;
|
||||||
|
|
||||||
|
private final MutableCharArray tempCharSequence;
|
||||||
|
private final Class<?> tokenFilterClass;
|
||||||
|
|
||||||
|
private ChineseTokenizer() throws Exception {
|
||||||
|
this.tempCharSequence = new MutableCharArray(new char[0]);
|
||||||
|
|
||||||
|
// As Smart Chinese is not available during compile time,
|
||||||
|
// we need to resort to reflection.
|
||||||
|
final Class<?> tokenizerClass = ReflectionUtils
|
||||||
|
.classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||||
|
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
|
||||||
|
Reader.class).newInstance((Reader) null);
|
||||||
|
this.tokenFilterClass = ReflectionUtils
|
||||||
|
.classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public short nextToken() throws IOException {
|
||||||
|
final boolean hasNextToken = wordTokenFilter.incrementToken();
|
||||||
|
if (hasNextToken) {
|
||||||
|
short flags = 0;
|
||||||
|
final char[] image = term.buffer();
|
||||||
|
final int length = term.length();
|
||||||
|
tempCharSequence.reset(image, 0, length);
|
||||||
|
if (length == 1 && image[0] == ',') {
|
||||||
|
// ChineseTokenizer seems to convert all punctuation to ','
|
||||||
|
// characters
|
||||||
|
flags = ITokenizer.TT_PUNCTUATION;
|
||||||
|
} else if (numeric.matcher(tempCharSequence).matches()) {
|
||||||
|
flags = ITokenizer.TT_NUMERIC;
|
||||||
|
} else {
|
||||||
|
flags = ITokenizer.TT_TERM;
|
||||||
|
}
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ITokenizer.TT_EOF;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTermBuffer(MutableCharArray array) {
|
||||||
|
array.reset(term.buffer(), 0, term.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset(Reader input) throws IOException {
|
||||||
|
try {
|
||||||
|
sentenceTokenizer.reset(input);
|
||||||
|
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
|
||||||
|
TokenStream.class).newInstance(sentenceTokenizer);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw ExceptionUtils.wrapAsRuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -46,7 +46,10 @@ import static org.junit.Assert.*;
|
||||||
public class CarrotClusteringEngineTest extends AbstractClusteringTest {
|
public class CarrotClusteringEngineTest extends AbstractClusteringTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCarrotLingo() throws Exception {
|
public void testCarrotLingo() throws Exception {
|
||||||
checkEngine(getClusteringEngine("default"), 10);
|
// Note: the expected number of clusters may change after upgrading Carrot2
|
||||||
|
// due to e.g. internal improvements or tuning of Carrot2 clustering.
|
||||||
|
final int expectedNumClusters = 10;
|
||||||
|
checkEngine(getClusteringEngine("default"), expectedNumClusters);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -54,7 +57,11 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTest {
|
||||||
ModifiableSolrParams solrParams = new ModifiableSolrParams();
|
ModifiableSolrParams solrParams = new ModifiableSolrParams();
|
||||||
solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
|
solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
|
||||||
solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
|
solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
|
||||||
checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, 15, new TermQuery(new Term("snippet", "mine")), solrParams);
|
|
||||||
|
// Note: the expected number of clusters may change after upgrading Carrot2
|
||||||
|
// due to e.g. internal improvements or tuning of Carrot2 clustering.
|
||||||
|
final int expectedNumClusters = 15;
|
||||||
|
checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -56,7 +56,6 @@
|
||||||
<!-- If a dir option (with or without a regex) is used and nothing is found
|
<!-- If a dir option (with or without a regex) is used and nothing is found
|
||||||
that matches, it will be ignored
|
that matches, it will be ignored
|
||||||
-->
|
-->
|
||||||
<lib dir="../../contrib/clustering/lib/downloads/" />
|
|
||||||
<lib dir="../../contrib/clustering/lib/" />
|
<lib dir="../../contrib/clustering/lib/" />
|
||||||
<lib dir="/total/crap/dir/ignored" />
|
<lib dir="/total/crap/dir/ignored" />
|
||||||
<!-- an exact path can be used to specify a specific file. This will cause
|
<!-- an exact path can be used to specify a specific file. This will cause
|
||||||
|
@ -808,6 +807,12 @@
|
||||||
parameter name and attribute value as parameter value.
|
parameter name and attribute value as parameter value.
|
||||||
-->
|
-->
|
||||||
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
|
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
The language to assume for the documents. For a list of allowed values, see:
|
||||||
|
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
|
||||||
|
-->
|
||||||
|
<str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
|
||||||
</lst>
|
</lst>
|
||||||
<lst name="engine">
|
<lst name="engine">
|
||||||
<str name="name">stc</str>
|
<str name="name">stc</str>
|
||||||
|
|
|
@ -1,2 +0,0 @@
|
||||||
AnyObjectId[a7abdbbdb4efdd6b3bf14075f83af01da0841487] was removed in git history.
|
|
||||||
Apache SVN contains full history.
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[3aea56f8076551e8a5b631d8cc3d40190089f0a8] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
Loading…
Reference in New Issue