mirror of https://github.com/apache/lucene.git
SOLR-1804: Re-integrated Carrot2
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@988129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ad8bfb3bc4
commit
9b62d49e92
|
@ -234,7 +234,7 @@ New Features
|
|||
|
||||
* SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing.
|
||||
(Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar,
|
||||
Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
|
||||
Koji Sekiguchi, Iv<EFBFBD>n de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
|
||||
Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
|
||||
Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger, yonik)
|
||||
|
||||
|
@ -521,6 +521,8 @@ Other Changes
|
|||
* SOLR-2003: SolrResourceLoader will report any encoding errors, rather than
|
||||
silently using replacement characters for invalid inputs (blargy via rmuir)
|
||||
|
||||
* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)
|
||||
|
||||
Build
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -778,26 +778,9 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
==========================================================================
|
||||
EHCache
|
||||
/**
|
||||
* Copyright 2003-2008 Luck Consulting Pty Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
==========================================================================
|
||||
Google Collections
|
||||
Guava
|
||||
/**
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
|
|
@ -148,7 +148,7 @@ Copyright (c) 2000-2005 INRIA, France Telecom
|
|||
=========================================================================
|
||||
== Carrot2 Notice ==
|
||||
=========================================================================
|
||||
Copyright (C) 2002-2008, Dawid Weiss, Stanislaw Osinski.
|
||||
Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
|
||||
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
|
||||
All rights reserved.
|
||||
|
||||
|
@ -156,24 +156,16 @@ This product includes software developed by the Carrot2 Project.
|
|||
|
||||
See http://project.carrot2.org/
|
||||
|
||||
=========================================================================
|
||||
== EHCache Notice ==
|
||||
=========================================================================
|
||||
Copyright 2003-2008 Luck Consulting Pty Ltd
|
||||
|
||||
This product includes software developed by the EHCache Project
|
||||
|
||||
See ????
|
||||
|
||||
=========================================================================
|
||||
== Google Collections Notice ==
|
||||
== Guava Notice ==
|
||||
=========================================================================
|
||||
|
||||
Copyright ???? Google, Inc.
|
||||
|
||||
This product includes software developed by the Google Collections project.
|
||||
This product includes software developed by the Google Guava project.
|
||||
|
||||
See ????
|
||||
See http://code.google.com/p/guava-libraries/
|
||||
|
||||
=========================================================================
|
||||
== Jackson Notice ==
|
||||
|
@ -182,7 +174,7 @@ Copyright ????
|
|||
|
||||
This product includes software developed by the Jackson project.
|
||||
|
||||
See ????
|
||||
See http://jackson.codehaus.org/
|
||||
|
||||
=========================================================================
|
||||
== HSQLDB Notice ==
|
||||
|
|
|
@ -8,12 +8,15 @@ CHANGES
|
|||
|
||||
$Id:$
|
||||
|
||||
================== Release 1.5-dev ==================
|
||||
================== Release XXXX ==================
|
||||
|
||||
* SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
|
||||
|
||||
* SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers)
|
||||
|
||||
* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2. No more LGPL run-time dependencies.
|
||||
This release of C2 also does not have a specific Lucene dependency. (Stanislaw Osinski, gsingers)
|
||||
|
||||
================== Release 1.4.0 ==================
|
||||
|
||||
Solr Clustering will be released for the first time in Solr 1.4. See http://wiki.apache.org/solr/ClusteringComponent
|
||||
|
|
|
@ -0,0 +1,160 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="solr-clustering" default="build">
|
||||
|
||||
<property name="solr-path" value="../.."/>
|
||||
|
||||
<import file="../../common-build.xml"/>
|
||||
|
||||
<description>
|
||||
Clustering Integraton
|
||||
</description>
|
||||
|
||||
<property name="example.local" value="example"/>
|
||||
|
||||
<path id="common.classpath">
|
||||
<fileset dir="lib"/>
|
||||
<pathelement location="${solr-path}/build/solr"/>
|
||||
<pathelement location="${solr-path}/build/solrj"/>
|
||||
<path refid="lucene.classpath"/>
|
||||
<fileset dir="${solr-path}/lib" includes="*.jar"/>
|
||||
</path>
|
||||
|
||||
<path id="test.classpath">
|
||||
<pathelement path="${dest}/classes"/>
|
||||
<pathelement path="${dest}/test-classes"/>
|
||||
<pathelement path="${java.class.path}"/>
|
||||
<pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
|
||||
<pathelement location="${common-solr.dir}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
|
||||
<path refid="common.classpath"/>
|
||||
</path>
|
||||
|
||||
<target name="clean">
|
||||
<delete failonerror="false" dir="${dest}"/>
|
||||
|
||||
<!-- example doesn't create this anymore, but clean it up
|
||||
if it's still there from an old build
|
||||
-->
|
||||
<delete dir="example/lib" />
|
||||
</target>
|
||||
|
||||
|
||||
<target name="init">
|
||||
<mkdir dir="${dest}/classes"/>
|
||||
|
||||
<mkdir dir="${build.javadoc}"/>
|
||||
<subant target="compileTests">
|
||||
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||
</subant>
|
||||
<subant target="make-manifest">
|
||||
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="compile" depends="init">
|
||||
<solr-javac destdir="${dest}/classes"
|
||||
classpathref="common.classpath">
|
||||
<src path="src/main/java"/>
|
||||
</solr-javac>
|
||||
</target>
|
||||
|
||||
<target name="build" depends="compile">
|
||||
<solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
|
||||
manifest="../../${dest}/META-INF/MANIFEST.MF"/>
|
||||
</target>
|
||||
|
||||
<target name="compileTests" depends="compile">
|
||||
<solr-javac destdir="${dest}/test-classes"
|
||||
classpathref="test.classpath">
|
||||
<src path="src/test/java"/>
|
||||
</solr-javac>
|
||||
</target>
|
||||
|
||||
<target name="example" depends="build,dist">
|
||||
<!-- this task use to copy lib's but that's no longer needed because
|
||||
../lib and ../lib/downloads are now included explicitly by
|
||||
example/conf/solrconfig.xml
|
||||
-->
|
||||
</target>
|
||||
|
||||
|
||||
<target name="test" depends="compileTests">
|
||||
<mkdir dir="${junit.output.dir}"/>
|
||||
|
||||
<junit printsummary="on"
|
||||
haltonfailure="no"
|
||||
maxmemory="512M"
|
||||
errorProperty="tests.failed"
|
||||
failureProperty="tests.failed"
|
||||
dir="src/test/resources/"
|
||||
tempdir="${junit.output.dir}"
|
||||
>
|
||||
<formatter type="brief" usefile="false" if="junit.details"/>
|
||||
<classpath refid="test.classpath"/>
|
||||
<assertions>
|
||||
<enable package="org.apache.lucene"/>
|
||||
<enable package="org.apache.solr"/>
|
||||
</assertions>
|
||||
<formatter type="xml"/>
|
||||
<batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
|
||||
<fileset dir="src/test/java" includes="${junit.includes}">
|
||||
<exclude name="**/AbstractClusteringTest*"/>
|
||||
</fileset>
|
||||
</batchtest>
|
||||
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
|
||||
<fileset dir="src/test/java" includes="**/${testcase}.java"/>
|
||||
</batchtest>
|
||||
</junit>
|
||||
|
||||
<fail if="tests.failed">Tests failed!</fail>
|
||||
</target>
|
||||
|
||||
<target name="dist" depends="build">
|
||||
<!--
|
||||
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/build/web/WEB-INF/lib"/>
|
||||
<copy todir="${solr-path}/build/web/WEB-INF/lib" flatten="true">
|
||||
<fileset dir="lib">
|
||||
<include name="**/*.jar"/>
|
||||
</fileset>
|
||||
</copy>
|
||||
-->
|
||||
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
|
||||
</target>
|
||||
|
||||
<target name="javadoc">
|
||||
<sequential>
|
||||
<mkdir dir="${build.javadoc}/contrib-${name}"/>
|
||||
|
||||
<path id="javadoc.classpath">
|
||||
<path refid="common.classpath"/>
|
||||
</path>
|
||||
|
||||
<invoke-javadoc
|
||||
destdir="${build.javadoc}/contrib-${name}"
|
||||
title="${Name} ${version} contrib-${fullnamever} API">
|
||||
<sources>
|
||||
<packageset dir="src/main/java"/>
|
||||
</sources>
|
||||
</invoke-javadoc>
|
||||
</sequential>
|
||||
</target>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[96c3bdbdaacd5289b0e654842e435689fbcf22e2] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[5ca86c5e72b2953feb0b58fbd87f76d0301cbbf6] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[85a0ab428be7c8913c120aa932a3d78f705fa73a] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[05c00b3fbfe234cd33477291432af9d172f13e15] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[f6f425d4a0c127d5249d939b7a93b1250d454cdd] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[24107e68fedb0ea04291fa769cf992fc53608c60] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[c1652907ebda1e69895d85730f4fc83e1160306e] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ebf9b5ef21631c4c76e21ae6fc35b0a9f7336d2c] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[06481add9c7ebce50b75f14d469566906bc0d5a3] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[374e19d07c7f073dc9c84f4dfada8c944389efc0] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[f668bc86b1d42264758b54a0395e49eb564dfd27] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -58,7 +58,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
|||
/**
|
||||
* Carrot2 controller that manages instances of clustering algorithms
|
||||
*/
|
||||
private CachingController controller = new CachingController();
|
||||
private Controller controller = ControllerFactory.createPooling();
|
||||
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
|
||||
|
||||
private String idFieldName;
|
||||
|
@ -91,6 +91,12 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
|||
// Initialize Carrot2 controller. Pass initialization attributes, if any.
|
||||
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
|
||||
extractCarrotAttributes(initParams, initAttributes);
|
||||
|
||||
// Customize the language model factory. The implementation we provide here
|
||||
// is included in the code base of Solr, so that it's possible to refactor
|
||||
// the Lucene APIs the factory relies on if needed.
|
||||
initAttributes.put("PreprocessingPipeline.languageModelFactory",
|
||||
new LuceneLanguageModelFactory());
|
||||
this.controller.init(initAttributes);
|
||||
|
||||
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
|
||||
|
|
|
@ -0,0 +1,353 @@
|
|||
package org.apache.solr.handler.clustering.carrot2;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.HashMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizer;
|
||||
import org.apache.lucene.analysis.ar.ArabicStemmer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.carrot2.core.LanguageCode;
|
||||
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
|
||||
import org.carrot2.text.analysis.ITokenizer;
|
||||
import org.carrot2.text.linguistic.BaseLanguageModelFactory;
|
||||
import org.carrot2.text.linguistic.IStemmer;
|
||||
import org.carrot2.text.linguistic.IdentityStemmer;
|
||||
import org.carrot2.text.util.MutableCharArray;
|
||||
import org.carrot2.util.ExceptionUtils;
|
||||
import org.carrot2.util.ReflectionUtils;
|
||||
import org.carrot2.util.attribute.Bindable;
|
||||
import org.slf4j.Logger;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.tartarus.snowball.ext.DanishStemmer;
|
||||
import org.tartarus.snowball.ext.DutchStemmer;
|
||||
import org.tartarus.snowball.ext.EnglishStemmer;
|
||||
import org.tartarus.snowball.ext.FinnishStemmer;
|
||||
import org.tartarus.snowball.ext.FrenchStemmer;
|
||||
import org.tartarus.snowball.ext.GermanStemmer;
|
||||
import org.tartarus.snowball.ext.HungarianStemmer;
|
||||
import org.tartarus.snowball.ext.ItalianStemmer;
|
||||
import org.tartarus.snowball.ext.NorwegianStemmer;
|
||||
import org.tartarus.snowball.ext.PortugueseStemmer;
|
||||
import org.tartarus.snowball.ext.RomanianStemmer;
|
||||
import org.tartarus.snowball.ext.RussianStemmer;
|
||||
import org.tartarus.snowball.ext.SpanishStemmer;
|
||||
import org.tartarus.snowball.ext.SwedishStemmer;
|
||||
import org.tartarus.snowball.ext.TurkishStemmer;
|
||||
|
||||
/**
|
||||
* A Solr-specific language model factory for Carrot2. This factory is the only
|
||||
* element in Carrot2 that depends on Lucene APIs, so should the APIs need to
|
||||
* change, the changes can be made in this class.
|
||||
*/
|
||||
@Bindable(prefix = "DefaultLanguageModelFactory")
|
||||
public class LuceneLanguageModelFactory extends BaseLanguageModelFactory {
|
||||
final static Logger logger = org.slf4j.LoggerFactory
|
||||
.getLogger(LuceneLanguageModelFactory.class);
|
||||
|
||||
/**
|
||||
* Provide an {@link IStemmer} implementation for a given language.
|
||||
*/
|
||||
protected IStemmer createStemmer(LanguageCode language) {
|
||||
switch (language) {
|
||||
case ARABIC:
|
||||
return ArabicStemmerFactory.createStemmer();
|
||||
|
||||
case CHINESE_SIMPLIFIED:
|
||||
return IdentityStemmer.INSTANCE;
|
||||
|
||||
default:
|
||||
/*
|
||||
* For other languages, try to use snowball's stemming.
|
||||
*/
|
||||
return SnowballStemmerFactory.createStemmer(language);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ITokenizer createTokenizer(LanguageCode language) {
|
||||
switch (language) {
|
||||
case CHINESE_SIMPLIFIED:
|
||||
return ChineseTokenizerFactory.createTokenizer();
|
||||
|
||||
/*
|
||||
* We use our own analyzer for Arabic. Lucene's version has special
|
||||
* support for Nonspacing-Mark characters (see
|
||||
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
|
||||
* have them included as letters in the parser.
|
||||
*/
|
||||
case ARABIC:
|
||||
// Intentional fall-through.
|
||||
|
||||
default:
|
||||
return new ExtendedWhitespaceTokenizer();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory of {@link IStemmer} implementations from the <code>snowball</code>
|
||||
* project.
|
||||
*/
|
||||
private final static class SnowballStemmerFactory {
|
||||
/**
|
||||
* Static hard mapping from language codes to stemmer classes in Snowball.
|
||||
* This mapping is not dynamic because we want to keep the possibility to
|
||||
* obfuscate these classes.
|
||||
*/
|
||||
private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
|
||||
static {
|
||||
snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
|
||||
snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
|
||||
snowballStemmerClasses
|
||||
.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
|
||||
snowballStemmerClasses
|
||||
.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
|
||||
PortugueseStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* An adapter converting Snowball programs into {@link IStemmer} interface.
|
||||
*/
|
||||
private static class SnowballStemmerAdapter implements IStemmer {
|
||||
private final SnowballProgram snowballStemmer;
|
||||
|
||||
public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
|
||||
this.snowballStemmer = snowballStemmer;
|
||||
}
|
||||
|
||||
public CharSequence stem(CharSequence word) {
|
||||
snowballStemmer.setCurrent(word.toString());
|
||||
if (snowballStemmer.stem()) {
|
||||
return snowballStemmer.getCurrent();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and return an {@link IStemmer} adapter for a
|
||||
* {@link SnowballProgram} for a given language code. An identity stemmer is
|
||||
* returned for unknown languages.
|
||||
*/
|
||||
public static IStemmer createStemmer(LanguageCode language) {
|
||||
final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
|
||||
.get(language);
|
||||
|
||||
if (stemmerClazz == null) {
|
||||
logger.warn("No Snowball stemmer class for: " + language.name()
|
||||
+ ". Quality of clustering may be degraded.");
|
||||
return IdentityStemmer.INSTANCE;
|
||||
}
|
||||
|
||||
try {
|
||||
return new SnowballStemmerAdapter(stemmerClazz.newInstance());
|
||||
} catch (Exception e) {
|
||||
logger.warn("Could not instantiate snowball stemmer"
|
||||
+ " for language: " + language.name()
|
||||
+ ". Quality of clustering may be degraded.", e);
|
||||
|
||||
return IdentityStemmer.INSTANCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory of {@link IStemmer} implementations for the
|
||||
* {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
|
||||
* to be present in classpath, otherwise an empty (identity) stemmer is
|
||||
* returned.
|
||||
*/
|
||||
private static class ArabicStemmerFactory {
|
||||
static {
|
||||
try {
|
||||
ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
|
||||
ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
|
||||
} catch (ClassNotFoundException e) {
|
||||
logger
|
||||
.warn(
|
||||
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
|
||||
+ "of Chinese content may be degraded. For best quality clusters, "
|
||||
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapter to lucene-contrib Arabic analyzers.
|
||||
*/
|
||||
private static class LuceneStemmerAdapter implements IStemmer {
|
||||
private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
|
||||
private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
|
||||
|
||||
private char[] buffer = new char[0];
|
||||
|
||||
private LuceneStemmerAdapter() throws Exception {
|
||||
delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
|
||||
normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
|
||||
}
|
||||
|
||||
public CharSequence stem(CharSequence word) {
|
||||
if (word.length() > buffer.length) {
|
||||
buffer = new char[word.length()];
|
||||
}
|
||||
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
buffer[i] = word.charAt(i);
|
||||
}
|
||||
|
||||
int newLen = normalizer.normalize(buffer, word.length());
|
||||
newLen = delegate.stem(buffer, newLen);
|
||||
|
||||
if (newLen != word.length() || !equals(buffer, newLen, word)) {
|
||||
return CharBuffer.wrap(buffer, 0, newLen);
|
||||
}
|
||||
|
||||
// Same-same.
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean equals(char[] buffer, int len, CharSequence word) {
|
||||
assert len == word.length();
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (buffer[i] != word.charAt(i))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public static IStemmer createStemmer() {
|
||||
try {
|
||||
return new LuceneStemmerAdapter();
|
||||
} catch (Throwable e) {
|
||||
return IdentityStemmer.INSTANCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
|
||||
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
|
||||
* factory will fall back to the default white space tokenizer.
|
||||
*/
|
||||
private static final class ChineseTokenizerFactory {
|
||||
static {
|
||||
try {
|
||||
ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||
ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||
} catch (Throwable e) {
|
||||
logger
|
||||
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
|
||||
+ "of Chinese content may be degraded. For best quality clusters, "
|
||||
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
|
||||
}
|
||||
}
|
||||
|
||||
static ITokenizer createTokenizer() {
|
||||
try {
|
||||
return new ChineseTokenizer();
|
||||
} catch (Throwable e) {
|
||||
return new ExtendedWhitespaceTokenizer();
|
||||
}
|
||||
}
|
||||
|
||||
private final static class ChineseTokenizer implements ITokenizer {
|
||||
private final static Pattern numeric = Pattern
|
||||
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
|
||||
|
||||
private Tokenizer sentenceTokenizer;
|
||||
private TokenStream wordTokenFilter;
|
||||
private CharTermAttribute term = null;
|
||||
|
||||
private final MutableCharArray tempCharSequence;
|
||||
private final Class<?> tokenFilterClass;
|
||||
|
||||
private ChineseTokenizer() throws Exception {
|
||||
this.tempCharSequence = new MutableCharArray(new char[0]);
|
||||
|
||||
// As Smart Chinese is not available during compile time,
|
||||
// we need to resort to reflection.
|
||||
final Class<?> tokenizerClass = ReflectionUtils
|
||||
.classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
|
||||
Reader.class).newInstance((Reader) null);
|
||||
this.tokenFilterClass = ReflectionUtils
|
||||
.classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||
}
|
||||
|
||||
public short nextToken() throws IOException {
|
||||
final boolean hasNextToken = wordTokenFilter.incrementToken();
|
||||
if (hasNextToken) {
|
||||
short flags = 0;
|
||||
final char[] image = term.buffer();
|
||||
final int length = term.length();
|
||||
tempCharSequence.reset(image, 0, length);
|
||||
if (length == 1 && image[0] == ',') {
|
||||
// ChineseTokenizer seems to convert all punctuation to ','
|
||||
// characters
|
||||
flags = ITokenizer.TT_PUNCTUATION;
|
||||
} else if (numeric.matcher(tempCharSequence).matches()) {
|
||||
flags = ITokenizer.TT_NUMERIC;
|
||||
} else {
|
||||
flags = ITokenizer.TT_TERM;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
return ITokenizer.TT_EOF;
|
||||
}
|
||||
|
||||
public void setTermBuffer(MutableCharArray array) {
|
||||
array.reset(term.buffer(), 0, term.length());
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
try {
|
||||
sentenceTokenizer.reset(input);
|
||||
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
|
||||
TokenStream.class).newInstance(sentenceTokenizer);
|
||||
} catch (Exception e) {
|
||||
throw ExceptionUtils.wrapAsRuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -46,7 +46,10 @@ import static org.junit.Assert.*;
|
|||
public class CarrotClusteringEngineTest extends AbstractClusteringTest {
|
||||
@Test
|
||||
public void testCarrotLingo() throws Exception {
|
||||
checkEngine(getClusteringEngine("default"), 10);
|
||||
// Note: the expected number of clusters may change after upgrading Carrot2
|
||||
// due to e.g. internal improvements or tuning of Carrot2 clustering.
|
||||
final int expectedNumClusters = 10;
|
||||
checkEngine(getClusteringEngine("default"), expectedNumClusters);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -54,7 +57,11 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTest {
|
|||
ModifiableSolrParams solrParams = new ModifiableSolrParams();
|
||||
solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
|
||||
solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
|
||||
checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, 15, new TermQuery(new Term("snippet", "mine")), solrParams);
|
||||
|
||||
// Note: the expected number of clusters may change after upgrading Carrot2
|
||||
// due to e.g. internal improvements or tuning of Carrot2 clustering.
|
||||
final int expectedNumClusters = 15;
|
||||
checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -56,7 +56,6 @@
|
|||
<!-- If a dir option (with or without a regex) is used and nothing is found
|
||||
that matches, it will be ignored
|
||||
-->
|
||||
<lib dir="../../contrib/clustering/lib/downloads/" />
|
||||
<lib dir="../../contrib/clustering/lib/" />
|
||||
<lib dir="/total/crap/dir/ignored" />
|
||||
<!-- an exact path can be used to specify a specific file. This will cause
|
||||
|
@ -808,6 +807,12 @@
|
|||
parameter name and attribute value as parameter value.
|
||||
-->
|
||||
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
|
||||
|
||||
<!--
|
||||
The language to assume for the documents. For a list of allowed values, see:
|
||||
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
|
||||
-->
|
||||
<str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">stc</str>
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[a7abdbbdb4efdd6b3bf14075f83af01da0841487] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[3aea56f8076551e8a5b631d8cc3d40190089f0a8] was removed in git history.
|
||||
Apache SVN contains full history.
|
Loading…
Reference in New Issue