From 9b62d49e924eb73afd6e0ca9054d29d8b394f54f Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Mon, 23 Aug 2010 14:24:00 +0000 Subject: [PATCH] SOLR-1804: Re-integrated Carrot2 git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@988129 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 4 +- solr/LICENSE.txt | 19 +- solr/NOTICE.txt | 18 +- solr/contrib/clustering/CHANGES.txt | 5 +- solr/contrib/clustering/build.xml | 160 ++++++++ .../clustering/lib/carrot2-core-3.4.0.jar | 2 + .../clustering/lib/carrot2-mini-3.1.0.jar | 2 - solr/contrib/clustering/lib/ehcache-1.6.2.jar | 2 - solr/contrib/clustering/lib/hppc-0.3.1.jar | 2 + .../lib/jackson-core-asl-0.9.9-6.jar | 2 - .../clustering/lib/jackson-core-asl-1.5.2.jar | 2 + .../lib/jackson-mapper-asl-0.9.9-6.jar | 2 - .../lib/jackson-mapper-asl-1.5.2.jar | 2 + .../clustering/lib/mahout-collections-0.3.jar | 2 + .../clustering/lib/mahout-math-0.3.jar | 2 + .../clustering/lib/simple-xml-2.3.5.jar | 2 + .../carrot2/CarrotClusteringEngine.java | 8 +- .../carrot2/LuceneLanguageModelFactory.java | 353 ++++++++++++++++++ .../carrot2/CarrotClusteringEngineTest.java | 11 +- solr/example/solr/conf/solrconfig.xml | 7 +- solr/lib/google-collect-1.0.jar | 2 - solr/lib/guava-r05.jar | 2 + 22 files changed, 564 insertions(+), 47 deletions(-) create mode 100644 solr/contrib/clustering/build.xml create mode 100644 solr/contrib/clustering/lib/carrot2-core-3.4.0.jar delete mode 100644 solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar delete mode 100644 solr/contrib/clustering/lib/ehcache-1.6.2.jar create mode 100644 solr/contrib/clustering/lib/hppc-0.3.1.jar delete mode 100644 solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar create mode 100644 solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar delete mode 100644 solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar create mode 100644 solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar create mode 100644 solr/contrib/clustering/lib/mahout-collections-0.3.jar create mode 100644 solr/contrib/clustering/lib/mahout-math-0.3.jar create mode 100644 solr/contrib/clustering/lib/simple-xml-2.3.5.jar create mode 100644 solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java delete mode 100644 solr/lib/google-collect-1.0.jar create mode 100644 solr/lib/guava-r05.jar diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e7ef40949ed..57909cbb40b 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -234,7 +234,7 @@ New Features * SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing. (Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar, - Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich, + Koji Sekiguchi, Iv�n de Prado, Ryan McKinley, Marc Sturlese, Peter Karich, Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald, Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger, yonik) @@ -521,6 +521,8 @@ Other Changes * SOLR-2003: SolrResourceLoader will report any encoding errors, rather than silently using replacement characters for invalid inputs (blargy via rmuir) +* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers) + Build ---------------------- diff --git a/solr/LICENSE.txt b/solr/LICENSE.txt index 06520adb501..5bf358cecd1 100644 --- a/solr/LICENSE.txt +++ b/solr/LICENSE.txt @@ -778,26 +778,9 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -========================================================================== -EHCache -/** - * Copyright 2003-2008 Luck Consulting Pty Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ ========================================================================== -Google Collections +Guava /** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt index be0d24f8380..39e8d4fdc47 100644 --- a/solr/NOTICE.txt +++ b/solr/NOTICE.txt @@ -148,7 +148,7 @@ Copyright (c) 2000-2005 INRIA, France Telecom ========================================================================= == Carrot2 Notice == ========================================================================= -Copyright (C) 2002-2008, Dawid Weiss, Stanislaw Osinski. +Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski. Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file. All rights reserved. @@ -156,24 +156,16 @@ This product includes software developed by the Carrot2 Project. See http://project.carrot2.org/ -========================================================================= -== EHCache Notice == -========================================================================= -Copyright 2003-2008 Luck Consulting Pty Ltd - -This product includes software developed by the EHCache Project - -See ???? ========================================================================= -== Google Collections Notice == +== Guava Notice == ========================================================================= Copyright ???? Google, Inc. -This product includes software developed by the Google Collections project. +This product includes software developed by the Google Guava project. -See ???? +See http://code.google.com/p/guava-libraries/ ========================================================================= == Jackson Notice == @@ -182,7 +174,7 @@ Copyright ???? This product includes software developed by the Jackson project. -See ???? +See http://jackson.codehaus.org/ ========================================================================= == HSQLDB Notice == diff --git a/solr/contrib/clustering/CHANGES.txt b/solr/contrib/clustering/CHANGES.txt index 93ba1cc86a0..4e1c3767568 100644 --- a/solr/contrib/clustering/CHANGES.txt +++ b/solr/contrib/clustering/CHANGES.txt @@ -8,12 +8,15 @@ CHANGES $Id:$ -================== Release 1.5-dev ================== +================== Release XXXX ================== * SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set) method b/c it can use the document cache (gsingers) * SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers) +* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2. No more LGPL run-time dependencies. + This release of C2 also does not have a specific Lucene dependency. (Stanislaw Osinski, gsingers) + ================== Release 1.4.0 ================== Solr Clustering will be released for the first time in Solr 1.4. See http://wiki.apache.org/solr/ClusteringComponent diff --git a/solr/contrib/clustering/build.xml b/solr/contrib/clustering/build.xml new file mode 100644 index 00000000000..1f6466f7519 --- /dev/null +++ b/solr/contrib/clustering/build.xml @@ -0,0 +1,160 @@ + + + + + + + + + + + + Clustering Integraton + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tests failed! + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar b/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar new file mode 100644 index 00000000000..a09b28ac477 --- /dev/null +++ b/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[96c3bdbdaacd5289b0e654842e435689fbcf22e2] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar b/solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar deleted file mode 100644 index 34cc9bc1e09..00000000000 --- a/solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[5ca86c5e72b2953feb0b58fbd87f76d0301cbbf6] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/ehcache-1.6.2.jar b/solr/contrib/clustering/lib/ehcache-1.6.2.jar deleted file mode 100644 index 37d60601738..00000000000 --- a/solr/contrib/clustering/lib/ehcache-1.6.2.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[85a0ab428be7c8913c120aa932a3d78f705fa73a] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/hppc-0.3.1.jar b/solr/contrib/clustering/lib/hppc-0.3.1.jar new file mode 100644 index 00000000000..de25884ed0c --- /dev/null +++ b/solr/contrib/clustering/lib/hppc-0.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[05c00b3fbfe234cd33477291432af9d172f13e15] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar b/solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar deleted file mode 100644 index 13b2de58b96..00000000000 --- a/solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[f6f425d4a0c127d5249d939b7a93b1250d454cdd] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar b/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar new file mode 100644 index 00000000000..15b207865ed --- /dev/null +++ b/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[24107e68fedb0ea04291fa769cf992fc53608c60] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar b/solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar deleted file mode 100644 index 955b2f64f29..00000000000 --- a/solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[c1652907ebda1e69895d85730f4fc83e1160306e] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar b/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar new file mode 100644 index 00000000000..e054dc10a72 --- /dev/null +++ b/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[ebf9b5ef21631c4c76e21ae6fc35b0a9f7336d2c] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/mahout-collections-0.3.jar b/solr/contrib/clustering/lib/mahout-collections-0.3.jar new file mode 100644 index 00000000000..e8a35281f70 --- /dev/null +++ b/solr/contrib/clustering/lib/mahout-collections-0.3.jar @@ -0,0 +1,2 @@ +AnyObjectId[06481add9c7ebce50b75f14d469566906bc0d5a3] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/mahout-math-0.3.jar b/solr/contrib/clustering/lib/mahout-math-0.3.jar new file mode 100644 index 00000000000..cf74a18768c --- /dev/null +++ b/solr/contrib/clustering/lib/mahout-math-0.3.jar @@ -0,0 +1,2 @@ +AnyObjectId[374e19d07c7f073dc9c84f4dfada8c944389efc0] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/lib/simple-xml-2.3.5.jar b/solr/contrib/clustering/lib/simple-xml-2.3.5.jar new file mode 100644 index 00000000000..6abdb9ca750 --- /dev/null +++ b/solr/contrib/clustering/lib/simple-xml-2.3.5.jar @@ -0,0 +1,2 @@ +AnyObjectId[f668bc86b1d42264758b54a0395e49eb564dfd27] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java b/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java index 55e23f94cb4..073877c6304 100644 --- a/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java +++ b/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java @@ -58,7 +58,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { /** * Carrot2 controller that manages instances of clustering algorithms */ - private CachingController controller = new CachingController(); + private Controller controller = ControllerFactory.createPooling(); private Class clusteringAlgorithmClass; private String idFieldName; @@ -91,6 +91,12 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { // Initialize Carrot2 controller. Pass initialization attributes, if any. HashMap initAttributes = new HashMap(); extractCarrotAttributes(initParams, initAttributes); + + // Customize the language model factory. The implementation we provide here + // is included in the code base of Solr, so that it's possible to refactor + // the Lucene APIs the factory relies on if needed. + initAttributes.put("PreprocessingPipeline.languageModelFactory", + new LuceneLanguageModelFactory()); this.controller.init(initAttributes); this.idFieldName = core.getSchema().getUniqueKeyField().getName(); diff --git a/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java b/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java new file mode 100644 index 00000000000..d7b2ace1f6b --- /dev/null +++ b/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java @@ -0,0 +1,353 @@ +package org.apache.solr.handler.clustering.carrot2; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.nio.CharBuffer; +import java.util.HashMap; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ar.ArabicNormalizer; +import org.apache.lucene.analysis.ar.ArabicStemmer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.carrot2.core.LanguageCode; +import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer; +import org.carrot2.text.analysis.ITokenizer; +import org.carrot2.text.linguistic.BaseLanguageModelFactory; +import org.carrot2.text.linguistic.IStemmer; +import org.carrot2.text.linguistic.IdentityStemmer; +import org.carrot2.text.util.MutableCharArray; +import org.carrot2.util.ExceptionUtils; +import org.carrot2.util.ReflectionUtils; +import org.carrot2.util.attribute.Bindable; +import org.slf4j.Logger; +import org.tartarus.snowball.SnowballProgram; +import org.tartarus.snowball.ext.DanishStemmer; +import org.tartarus.snowball.ext.DutchStemmer; +import org.tartarus.snowball.ext.EnglishStemmer; +import org.tartarus.snowball.ext.FinnishStemmer; +import org.tartarus.snowball.ext.FrenchStemmer; +import org.tartarus.snowball.ext.GermanStemmer; +import org.tartarus.snowball.ext.HungarianStemmer; +import org.tartarus.snowball.ext.ItalianStemmer; +import org.tartarus.snowball.ext.NorwegianStemmer; +import org.tartarus.snowball.ext.PortugueseStemmer; +import org.tartarus.snowball.ext.RomanianStemmer; +import org.tartarus.snowball.ext.RussianStemmer; +import org.tartarus.snowball.ext.SpanishStemmer; +import org.tartarus.snowball.ext.SwedishStemmer; +import org.tartarus.snowball.ext.TurkishStemmer; + +/** + * A Solr-specific language model factory for Carrot2. This factory is the only + * element in Carrot2 that depends on Lucene APIs, so should the APIs need to + * change, the changes can be made in this class. + */ +@Bindable(prefix = "DefaultLanguageModelFactory") +public class LuceneLanguageModelFactory extends BaseLanguageModelFactory { + final static Logger logger = org.slf4j.LoggerFactory + .getLogger(LuceneLanguageModelFactory.class); + + /** + * Provide an {@link IStemmer} implementation for a given language. + */ + protected IStemmer createStemmer(LanguageCode language) { + switch (language) { + case ARABIC: + return ArabicStemmerFactory.createStemmer(); + + case CHINESE_SIMPLIFIED: + return IdentityStemmer.INSTANCE; + + default: + /* + * For other languages, try to use snowball's stemming. + */ + return SnowballStemmerFactory.createStemmer(language); + } + } + + @Override + protected ITokenizer createTokenizer(LanguageCode language) { + switch (language) { + case CHINESE_SIMPLIFIED: + return ChineseTokenizerFactory.createTokenizer(); + + /* + * We use our own analyzer for Arabic. Lucene's version has special + * support for Nonspacing-Mark characters (see + * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we + * have them included as letters in the parser. + */ + case ARABIC: + // Intentional fall-through. + + default: + return new ExtendedWhitespaceTokenizer(); + } + } + + /** + * Factory of {@link IStemmer} implementations from the snowball + * project. + */ + private final static class SnowballStemmerFactory { + /** + * Static hard mapping from language codes to stemmer classes in Snowball. + * This mapping is not dynamic because we want to keep the possibility to + * obfuscate these classes. + */ + private static HashMap> snowballStemmerClasses; + static { + snowballStemmerClasses = new HashMap>(); + snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class); + snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class); + snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class); + snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class); + snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class); + snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class); + snowballStemmerClasses + .put(LanguageCode.HUNGARIAN, HungarianStemmer.class); + snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class); + snowballStemmerClasses + .put(LanguageCode.NORWEGIAN, NorwegianStemmer.class); + snowballStemmerClasses.put(LanguageCode.PORTUGUESE, + PortugueseStemmer.class); + snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class); + snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class); + snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class); + snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class); + snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class); + } + + /** + * An adapter converting Snowball programs into {@link IStemmer} interface. + */ + private static class SnowballStemmerAdapter implements IStemmer { + private final SnowballProgram snowballStemmer; + + public SnowballStemmerAdapter(SnowballProgram snowballStemmer) { + this.snowballStemmer = snowballStemmer; + } + + public CharSequence stem(CharSequence word) { + snowballStemmer.setCurrent(word.toString()); + if (snowballStemmer.stem()) { + return snowballStemmer.getCurrent(); + } else { + return null; + } + } + } + + /** + * Create and return an {@link IStemmer} adapter for a + * {@link SnowballProgram} for a given language code. An identity stemmer is + * returned for unknown languages. + */ + public static IStemmer createStemmer(LanguageCode language) { + final Class stemmerClazz = snowballStemmerClasses + .get(language); + + if (stemmerClazz == null) { + logger.warn("No Snowball stemmer class for: " + language.name() + + ". Quality of clustering may be degraded."); + return IdentityStemmer.INSTANCE; + } + + try { + return new SnowballStemmerAdapter(stemmerClazz.newInstance()); + } catch (Exception e) { + logger.warn("Could not instantiate snowball stemmer" + + " for language: " + language.name() + + ". Quality of clustering may be degraded.", e); + + return IdentityStemmer.INSTANCE; + } + } + } + + /** + * Factory of {@link IStemmer} implementations for the + * {@link LanguageCode#ARABIC} language. Requires lucene-contrib + * to be present in classpath, otherwise an empty (identity) stemmer is + * returned. + */ + private static class ArabicStemmerFactory { + static { + try { + ReflectionUtils.classForName(ArabicStemmer.class.getName(), false); + ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false); + } catch (ClassNotFoundException e) { + logger + .warn( + "Could not instantiate Lucene stemmer for Arabic, clustering quality " + + "of Chinese content may be degraded. For best quality clusters, " + + "make sure Lucene's Arabic analyzer JAR is in the classpath", + e); + } + } + + /** + * Adapter to lucene-contrib Arabic analyzers. + */ + private static class LuceneStemmerAdapter implements IStemmer { + private final org.apache.lucene.analysis.ar.ArabicStemmer delegate; + private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer; + + private char[] buffer = new char[0]; + + private LuceneStemmerAdapter() throws Exception { + delegate = new org.apache.lucene.analysis.ar.ArabicStemmer(); + normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer(); + } + + public CharSequence stem(CharSequence word) { + if (word.length() > buffer.length) { + buffer = new char[word.length()]; + } + + for (int i = 0; i < word.length(); i++) { + buffer[i] = word.charAt(i); + } + + int newLen = normalizer.normalize(buffer, word.length()); + newLen = delegate.stem(buffer, newLen); + + if (newLen != word.length() || !equals(buffer, newLen, word)) { + return CharBuffer.wrap(buffer, 0, newLen); + } + + // Same-same. + return null; + } + + private boolean equals(char[] buffer, int len, CharSequence word) { + assert len == word.length(); + + for (int i = 0; i < len; i++) { + if (buffer[i] != word.charAt(i)) + return false; + } + + return true; + } + } + + public static IStemmer createStemmer() { + try { + return new LuceneStemmerAdapter(); + } catch (Throwable e) { + return IdentityStemmer.INSTANCE; + } + } + } + + /** + * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's + * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the + * factory will fall back to the default white space tokenizer. + */ + private static final class ChineseTokenizerFactory { + static { + try { + ReflectionUtils.classForName( + "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false); + ReflectionUtils.classForName( + "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false); + } catch (Throwable e) { + logger + .warn("Could not instantiate Smart Chinese Analyzer, clustering quality " + + "of Chinese content may be degraded. For best quality clusters, " + + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath"); + } + } + + static ITokenizer createTokenizer() { + try { + return new ChineseTokenizer(); + } catch (Throwable e) { + return new ExtendedWhitespaceTokenizer(); + } + } + + private final static class ChineseTokenizer implements ITokenizer { + private final static Pattern numeric = Pattern + .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?"); + + private Tokenizer sentenceTokenizer; + private TokenStream wordTokenFilter; + private CharTermAttribute term = null; + + private final MutableCharArray tempCharSequence; + private final Class tokenFilterClass; + + private ChineseTokenizer() throws Exception { + this.tempCharSequence = new MutableCharArray(new char[0]); + + // As Smart Chinese is not available during compile time, + // we need to resort to reflection. + final Class tokenizerClass = ReflectionUtils + .classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false); + this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor( + Reader.class).newInstance((Reader) null); + this.tokenFilterClass = ReflectionUtils + .classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false); + } + + public short nextToken() throws IOException { + final boolean hasNextToken = wordTokenFilter.incrementToken(); + if (hasNextToken) { + short flags = 0; + final char[] image = term.buffer(); + final int length = term.length(); + tempCharSequence.reset(image, 0, length); + if (length == 1 && image[0] == ',') { + // ChineseTokenizer seems to convert all punctuation to ',' + // characters + flags = ITokenizer.TT_PUNCTUATION; + } else if (numeric.matcher(tempCharSequence).matches()) { + flags = ITokenizer.TT_NUMERIC; + } else { + flags = ITokenizer.TT_TERM; + } + return flags; + } + + return ITokenizer.TT_EOF; + } + + public void setTermBuffer(MutableCharArray array) { + array.reset(term.buffer(), 0, term.length()); + } + + public void reset(Reader input) throws IOException { + try { + sentenceTokenizer.reset(input); + wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor( + TokenStream.class).newInstance(sentenceTokenizer); + } catch (Exception e) { + throw ExceptionUtils.wrapAsRuntimeException(e); + } + } + } + } +} diff --git a/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java b/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java index bec4da10476..bece45d1ae3 100644 --- a/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java +++ b/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java @@ -46,7 +46,10 @@ import static org.junit.Assert.*; public class CarrotClusteringEngineTest extends AbstractClusteringTest { @Test public void testCarrotLingo() throws Exception { - checkEngine(getClusteringEngine("default"), 10); + // Note: the expected number of clusters may change after upgrading Carrot2 + // due to e.g. internal improvements or tuning of Carrot2 clustering. + final int expectedNumClusters = 10; + checkEngine(getClusteringEngine("default"), expectedNumClusters); } @Test @@ -54,7 +57,11 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTest { ModifiableSolrParams solrParams = new ModifiableSolrParams(); solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet"); solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this? - checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, 15, new TermQuery(new Term("snippet", "mine")), solrParams); + + // Note: the expected number of clusters may change after upgrading Carrot2 + // due to e.g. internal improvements or tuning of Carrot2 clustering. + final int expectedNumClusters = 15; + checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams); } @Test diff --git a/solr/example/solr/conf/solrconfig.xml b/solr/example/solr/conf/solrconfig.xml index 4436d38c679..6e06eb6b73b 100755 --- a/solr/example/solr/conf/solrconfig.xml +++ b/solr/example/solr/conf/solrconfig.xml @@ -56,7 +56,6 @@ - 20 + + + ENGLISH stc diff --git a/solr/lib/google-collect-1.0.jar b/solr/lib/google-collect-1.0.jar deleted file mode 100644 index a6e52bafaef..00000000000 --- a/solr/lib/google-collect-1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[a7abdbbdb4efdd6b3bf14075f83af01da0841487] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/lib/guava-r05.jar b/solr/lib/guava-r05.jar new file mode 100644 index 00000000000..c025ed3a543 --- /dev/null +++ b/solr/lib/guava-r05.jar @@ -0,0 +1,2 @@ +AnyObjectId[3aea56f8076551e8a5b631d8cc3d40190089f0a8] was removed in git history. +Apache SVN contains full history. \ No newline at end of file