mirror of https://github.com/apache/lucene.git
SOLR-2448: Upgrade of Carrot2 to version 3.5.0 and a number of related clustering improvements (SOLR-2449, SOLR-2450, SOLR-2505)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103722 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 77ac8172af
commit 548806b7f7
@@ -106,14 +106,6 @@
     </license>
   </licenses>
   <repositories>
-    <repository>
-      <id>carrot2.org</id>
-      <name>Carrot2 Maven2 repository</name>
-      <url>http://download.carrot2.org/maven2/</url>
-      <snapshots>
-        <updatePolicy>never</updatePolicy>
-      </snapshots>
-    </repository>
     <repository>
       <id>apache.snapshots</id>
       <name>Apache Snapshot Repository</name>
@@ -306,7 +298,7 @@
     <dependency>
       <groupId>org.carrot2</groupId>
       <artifactId>carrot2-core</artifactId>
-      <version>3.4.2</version>
+      <version>3.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.codehaus.woodstox</groupId>
@@ -26,7 +26,7 @@ Versions of Major Components
---------------------
Apache Lucene trunk
Apache Tika 0.8
-Carrot2 3.4.2
+Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1-SNAPSHOT
@@ -9,11 +9,19 @@ CHANGES
$Id$

================== Release 4.0.0-dev ==================

-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+  clustering algorithm added, loading of Carrot2 stop words from
+  <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+  (Stanislaw Osinski, Dawid Weiss).

================== Release 3.2.0-dev ==================

-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+  clustering algorithm added, loading of Carrot2 stop words from
+  <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+  (Stanislaw Osinski, Dawid Weiss).

================== Release 3.1.0-dev ==================
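As a rough illustration of the features summarized above (this sketch is not part of the commit), a SolrJ client could request clustering with the newly supported bisecting k-means algorithm. The server URL, core name, and field names are assumptions, and the Builder-style client API shown comes from a much later SolrJ than this commit:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class ClusteringRequestSketch {
  public static void main(String[] args) throws Exception {
    // Assumed URL and core name; adjust to the actual installation.
    HttpSolrClient client = new HttpSolrClient.Builder(
        "http://localhost:8983/solr/collection1").build();

    SolrQuery query = new SolrQuery("solr");
    query.set("clustering", true); // enable the ClusteringComponent
    // Ask for the bisecting k-means algorithm added by this upgrade.
    query.set("carrot.algorithm",
        "org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm");
    query.set("carrot.title", "title");      // assumed field names
    query.set("carrot.snippet", "features");

    QueryResponse rsp = client.query(query);
    // Clusters (labels, optional scores, document ids) come back in the
    // "clusters" section built by clustersToNamedList() in the diff below.
    System.out.println(rsp.getResponse().get("clusters"));
    client.close();
  }
}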
@@ -1,2 +0,0 @@
AnyObjectId[f872cbc8eec94f7d5b29a73f99cd13089848a3cd] was removed in git history.
Apache SVN contains full history.

@@ -0,0 +1,2 @@
AnyObjectId[adc127c48137d03e252f526de84a07c8d6bda521] was removed in git history.
Apache SVN contains full history.

@@ -1,2 +0,0 @@
AnyObjectId[05c00b3fbfe234cd33477291432af9d172f13e15] was removed in git history.
Apache SVN contains full history.

@@ -0,0 +1,2 @@
AnyObjectId[0da24b80aab135dc5811731b4e8aa69a77256d8a] was removed in git history.
Apache SVN contains full history.
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clustering.carrot2;
 */

+import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

@@ -37,6 +39,7 @@ import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;

@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
 *
 * @link http://project.carrot2.org
 */
@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
  private transient static Logger log = LoggerFactory
      .getLogger(CarrotClusteringEngine.class);

+  /**
+   * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+   */
+  private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+  /**
+   * Name of Carrot2 document's field containing Solr document's identifier.
+   */
+  private static final String SOLR_DOCUMENT_ID = "solrId";
+
+  /**
+   * Name of Solr document's field containing the document's identifier. To avoid
+   * repeating the content of documents in clusters on output, each cluster contains
+   * identifiers of documents it contains.
+   */
+  private String idFieldName;
+
  /**
   * Carrot2 controller that manages instances of clustering algorithms
   */
  private Controller controller = ControllerFactory.createPooling();
  private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;

-  private String idFieldName;
-
  @Override
  @Deprecated
  public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      attributes.put(AttributeNames.DOCUMENTS, documents);
      attributes.put(AttributeNames.QUERY, query.toString());

+      // Pass the fields on which clustering runs to the
+      // SolrStopwordsCarrot2LexicalDataFactory
+      attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
      // Pass extra overriding attributes from the request, if any
      extractCarrotAttributes(sreq.getParams(), attributes);
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    }
  }

  @Override
+  @SuppressWarnings({ "unchecked", "rawtypes" })
  public String init(NamedList config, final SolrCore core) {
    String result = super.init(config, core);
-    SolrParams initParams = SolrParams.toSolrParams(config);
+    final SolrParams initParams = SolrParams.toSolrParams(config);

    // Initialize Carrot2 controller. Pass initialization attributes, if any.
    HashMap<String, Object> initAttributes = new HashMap<String, Object>();
    extractCarrotAttributes(initParams, initAttributes);

-    // Customize the language model factory. The implementation we provide here
-    // is included in the code base of Solr, so that it's possible to refactor
-    // the Lucene APIs the factory relies on if needed.
-    initAttributes.put("PreprocessingPipeline.languageModelFactory",
-        LuceneLanguageModelFactory.class);
-    this.controller.init(initAttributes);
+    // Customize the stemmer and tokenizer factories. The implementations we provide here
+    // are included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factories rely on if needed.
+    // Additionally, we set a custom lexical resource factory for Carrot2 that
+    // will use both Carrot2 default stop words as well as stop words from
+    // the StopFilter defined on the field.
+    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+        .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+        .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+        .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+    // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+    initAttributes.put("solrIndexSchema", core.getSchema());
+
+    // Customize Carrot2's resource lookup to first look for resources
+    // using Solr's resource loader. If that fails, try loading from the classpath.
+    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+        .resourceLookup(new ResourceLookup(new IResourceLocator() {
+          @Override
+          public IResource[] getAll(final String resource) {
+            final SolrResourceLoader resourceLoader = core.getResourceLoader();
+            final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+                + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+            try {
+              log.debug("Looking for " + resource + " in "
+                  + carrot2ResourcesDir);
+              final InputStream resourceStream = resourceLoader
+                  .openResource(carrot2ResourcesDir + "/" + resource);
+
+              log.info(resource + " loaded from " + carrot2ResourcesDir);
+              final IResource foundResource = new IResource() {
+                @Override
+                public InputStream open() throws IOException {
+                  return resourceStream;
+                }
+              };
+              return new IResource[] { foundResource };
+            } catch (RuntimeException e) {
+              // No way to distinguish if the resource was found but failed
+              // to load or wasn't found at all, so we simply fall back
+              // to Carrot2 defaults here by returning an empty locations array.
+              log.debug(resource + " not found in " + carrot2ResourcesDir
+                  + ". Using the default " + resource + " from Carrot JAR.");
+              return new IResource[] {};
+            }
+          }
+        },
+
+        // Using the class loader directly because this time we want to omit the prefix
+        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+    this.controller.init(initAttributes);
    this.idFieldName = core.getSchema().getUniqueKeyField().getName();

    // Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,16 +223,28 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
  protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
    SolrParams solrParams = sreq.getParams();

-    // Names of fields to deliver content for clustering
-    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+    HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+    fields.add(idFieldName);
+    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+    return fields;
+  }
+
+  /**
+   * Returns the names of fields that will be delivering the actual
+   * content for clustering. Currently, there are two such fields: document
+   * title and document content.
+   */
+  private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+    SolrParams solrParams = sreq.getParams();
+
    String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
    if (StringUtils.isBlank(snippetField)) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
          + " must not be blank.");
    }
-    return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
-  }
+    return Sets.newHashSet(titleField, snippetField);
  }

  /**
   * Prepares Carrot2 documents for clustering.
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    if (produceSummary == true) {
      highlighter = HighlightComponent.getHighlighter(core);
      if (highlighter != null){
-        Map args = new HashMap();
+        Map<String, Object> args = Maps.newHashMap();
        snippetFieldAry = new String[]{snippetField};
        args.put(HighlightParams.FIELDS, snippetFieldAry);
        args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      if (produceSummary && docIds != null) {
        docsHolder[0] = docIds.get(sdoc).intValue();
        DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
-        NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
        if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
          //should only be one document with one field
-          NamedList tmp = (NamedList) highlights.getVal(0);
-          String [] highlt = (String[]) tmp.get(snippetField);
+          @SuppressWarnings("unchecked")
+          NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          String [] highlt = tmp.get(snippetField);
          if (highlt != null && highlt.length == 1) {
            snippet = highlt[0];
          }
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      }
      Document carrotDocument = new Document(getValue(sdoc, titleField),
          snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
      result.add(carrotDocument);
    }

    return result;
  }

-  @Deprecated
-  protected String getValue(org.apache.lucene.document.Document doc,
-      String field) {
-    StringBuilder result = new StringBuilder();
-    String[] vals = doc.getValues(field);
-    for (int i = 0; i < vals.length; i++) {
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append(vals[i]).append(" . ");
-    }
-    return result.toString().trim();
-  }
-
  protected String getValue(SolrDocument sdoc, String field) {
    StringBuilder result = new StringBuilder();
    Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    return result.toString().trim();
  }

-  private List clustersToNamedList(List<Cluster> carrotClusters,
+  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
      SolrParams solrParams) {
-    List result = new ArrayList();
+    List<NamedList<Object>> result = Lists.newArrayList();
    clustersToNamedList(carrotClusters, result, solrParams.getBool(
        CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
        CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
  }

  private void clustersToNamedList(List<Cluster> outputClusters,
-      List parent, boolean outputSubClusters, int maxLabels) {
+      List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
    for (Cluster outCluster : outputClusters) {
-      NamedList cluster = new SimpleOrderedMap();
+      NamedList<Object> cluster = new SimpleOrderedMap<Object>();
      parent.add(cluster);

      // Add labels
      List<String> labels = outCluster.getPhrases();
-      if (labels.size() > maxLabels)
+      if (labels.size() > maxLabels) {
        labels = labels.subList(0, maxLabels);
+      }
      cluster.add("labels", labels);

-      List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
-      List docList = new ArrayList();
-      cluster.add("docs", docList);
-      for (Document doc : docs) {
-        docList.add(doc.getField("solrId"));
+      // Add cluster score
+      final Double score = outCluster.getScore();
+      if (score != null) {
+        cluster.add("score", score);
      }

-      if (outputSubClusters) {
-        List subclusters = new ArrayList();
+      // Add other topics marker
+      if (outCluster.isOtherTopics()) {
+        cluster.add("other-topics", outCluster.isOtherTopics());
+      }
+
+      // Add documents
+      List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
+      List<Object> docList = Lists.newArrayList();
+      cluster.add("docs", docList);
+      for (Document doc : docs) {
+        docList.add(doc.getField(SOLR_DOCUMENT_ID));
+      }
+
+      // Add subclusters
+      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = Lists.newArrayList();
        cluster.add("clusters", subclusters);
        clustersToNamedList(outCluster.getSubclusters(), subclusters,
            outputSubClusters, maxLabels);
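For reference (not part of the commit), a consumer of the structure built by clustersToNamedList() above might unpack it like this hedged sketch, where the clusters argument stands for the value of the response's "clusters" section:

import java.util.List;
import org.apache.solr.common.util.NamedList;

public class ClusterOutputReaderSketch {
  @SuppressWarnings("unchecked")
  static void printClusters(List<NamedList<Object>> clusters) {
    for (NamedList<Object> cluster : clusters) {
      List<String> labels = (List<String>) cluster.get("labels");
      Double score = (Double) cluster.get("score"); // null unless the algorithm emits scores
      List<Object> docIds = (List<Object>) cluster.get("docs");
      // Subclusters are present only when carrot.outputSubClusters is on
      // and the algorithm produced any.
      List<NamedList<Object>> subs = (List<NamedList<Object>>) cluster.get("clusters");
      System.out.println(labels + " score=" + score + " docs=" + docIds
          + " subclusters=" + (subs == null ? 0 : subs.size()));
    }
  }
}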
@@ -35,6 +35,8 @@ public interface CarrotParams {
  String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
  String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";

+  String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
  public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
      ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
      PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
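These constants map one-to-one onto carrot.* request parameters. A hedged sketch of setting them programmatically (the field name "body" is an assumption for illustration):

import org.apache.solr.common.params.ModifiableSolrParams;

public class CarrotParamsSketch {
  static ModifiableSolrParams clusteringParams() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.SNIPPET_FIELD_NAME, "body"); // carrot.snippet
    params.set(CarrotParams.PRODUCE_SUMMARY, true);      // carrot.produceSummary
    params.set(CarrotParams.NUM_DESCRIPTIONS, 5);        // carrot.numDescriptions
    return params;
  }
}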
@@ -17,27 +17,15 @@ package org.apache.solr.handler.clustering.carrot2;
 * limitations under the License.
 */

-import java.io.IOException;
-import java.io.Reader;
-import java.nio.CharBuffer;
-import java.util.HashMap;
-import java.util.regex.Pattern;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizer;
import org.apache.lucene.analysis.ar.ArabicStemmer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
-import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
-import org.carrot2.text.analysis.ITokenizer;
-import org.carrot2.text.linguistic.DefaultLanguageModelFactory;
import org.carrot2.text.linguistic.IStemmer;
-import org.carrot2.text.linguistic.IdentityStemmer;
-import org.carrot2.text.util.MutableCharArray;
-import org.carrot2.util.ExceptionUtils;
+import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.util.ReflectionUtils;
-import org.carrot2.util.attribute.Bindable;
import org.slf4j.Logger;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.DanishStemmer;

@@ -57,20 +45,16 @@ import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;

/**
- * A Solr-specific language model factory for Carrot2. This factory is the only
- * element in Carrot2 that depends on Lucene APIs, so should the APIs need to
- * change, the changes can be made in this class.
+ * An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
+ * APIs. Should the relevant Lucene APIs need to change, the changes can be made
+ * in this class.
 */
-@Bindable(prefix = "DefaultLanguageModelFactory")
-public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
+public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
-      .getLogger(LuceneLanguageModelFactory.class);
+      .getLogger(LuceneCarrot2StemmerFactory.class);

  /**
   * Provide an {@link IStemmer} implementation for a given language.
   */
-  @Override
-  protected IStemmer createStemmer(LanguageCode language) {
+  public IStemmer getStemmer(LanguageCode language) {
    switch (language) {
    case ARABIC:
      return ArabicStemmerFactory.createStemmer();

@@ -86,26 +70,6 @@ public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
    }
  }

-  @Override
-  protected ITokenizer createTokenizer(LanguageCode language) {
-    switch (language) {
-    case CHINESE_SIMPLIFIED:
-      return ChineseTokenizerFactory.createTokenizer();
-
-      /*
-       * We use our own analyzer for Arabic. Lucene's version has special
-       * support for Nonspacing-Mark characters (see
-       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
-       * have them included as letters in the parser.
-       */
-    case ARABIC:
-      // Intentional fall-through.
-
-    default:
-      return new ExtendedWhitespaceTokenizer();
-    }
-  }
-
  /**
   * Factory of {@link IStemmer} implementations from the <code>snowball</code>
   * project.

@@ -263,92 +227,15 @@ public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
  }

  /**
-   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
-   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
-   * factory will fall back to the default white space tokenizer.
+   * An implementation of {@link IStemmer} that always returns <code>null</code>
+   * which means no stemming.
   */
-  private static final class ChineseTokenizerFactory {
-    static {
-      try {
-        ReflectionUtils.classForName(
-            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
-        ReflectionUtils.classForName(
-            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
-      } catch (Throwable e) {
-        logger
-            .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
-                + "of Chinese content may be degraded. For best quality clusters, "
-                + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
-      }
-    }
-
-    static ITokenizer createTokenizer() {
-      try {
-        return new ChineseTokenizer();
-      } catch (Throwable e) {
-        return new ExtendedWhitespaceTokenizer();
-      }
-    }
-
-    private final static class ChineseTokenizer implements ITokenizer {
-      private final static Pattern numeric = Pattern
-          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
-
-      private Tokenizer sentenceTokenizer;
-      private TokenStream wordTokenFilter;
-      private CharTermAttribute term = null;
-
-      private final MutableCharArray tempCharSequence;
-      private final Class<?> tokenFilterClass;
-
-      private ChineseTokenizer() throws Exception {
-        this.tempCharSequence = new MutableCharArray(new char[0]);
-
-        // As Smart Chinese is not available during compile time,
-        // we need to resort to reflection.
-        final Class<?> tokenizerClass = ReflectionUtils
-            .classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
-        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
-            Reader.class).newInstance((Reader) null);
-        this.tokenFilterClass = ReflectionUtils
-            .classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
-      }
-
-      public short nextToken() throws IOException {
-        final boolean hasNextToken = wordTokenFilter.incrementToken();
-        if (hasNextToken) {
-          short flags = 0;
-          final char[] image = term.buffer();
-          final int length = term.length();
-          tempCharSequence.reset(image, 0, length);
-          if (length == 1 && image[0] == ',') {
-            // ChineseTokenizer seems to convert all punctuation to ','
-            // characters
-            flags = ITokenizer.TT_PUNCTUATION;
-          } else if (numeric.matcher(tempCharSequence).matches()) {
-            flags = ITokenizer.TT_NUMERIC;
-          } else {
-            flags = ITokenizer.TT_TERM;
-          }
-          return flags;
-        }
-
-        return ITokenizer.TT_EOF;
-      }
-
-      public void setTermBuffer(MutableCharArray array) {
-        array.reset(term.buffer(), 0, term.length());
-      }
-
-      public void reset(Reader input) throws IOException {
-        try {
-          sentenceTokenizer.reset(input);
-          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
-              TokenStream.class).newInstance(sentenceTokenizer);
-        } catch (Exception e) {
-          throw ExceptionUtils.wrapAsRuntimeException(e);
-        }
-      }
-    }
-  }
+  private static class IdentityStemmer implements IStemmer {
+    private final static IdentityStemmer INSTANCE = new IdentityStemmer();
+
+    @Override
+    public CharSequence stem(CharSequence word) {
+      return null;
+    }
+  }
}
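A hedged usage sketch for the factory above; Carrot2 normally calls it internally through the attribute binding set up in CarrotClusteringEngine.init(), so direct use is shown only for illustration:

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;

public class StemmerFactorySketch {
  public static void main(String[] args) {
    IStemmerFactory factory = new LuceneCarrot2StemmerFactory();
    IStemmer stemmer = factory.getStemmer(LanguageCode.ENGLISH);
    // Per the IStemmer contract used above, null means "no stemmed form";
    // the IdentityStemmer fallback always returns null.
    CharSequence stem = stemmer.stem("clustering");
    System.out.println(stem != null ? stem : "unstemmed");
  }
}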
@@ -0,0 +1,156 @@
package org.apache.solr.handler.clustering.carrot2;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;

/**
 * An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
 * Smart Chinese tokenizer. If Smart Chinese tokenizer is not available in
 * classpath at runtime, the default Carrot2's tokenizer is used. Should the
 * Lucene APIs need to change, the changes can be made in this class.
 */
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(LuceneCarrot2TokenizerFactory.class);

  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    switch (language) {
    case CHINESE_SIMPLIFIED:
      return ChineseTokenizerFactory.createTokenizer();

      /*
       * We use our own analyzer for Arabic. Lucene's version has special
       * support for Nonspacing-Mark characters (see
       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
       * have them included as letters in the parser.
       */
    case ARABIC:
      // Intentional fall-through.

    default:
      return new ExtendedWhitespaceTokenizer();
    }
  }

  /**
   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
   * factory will fall back to the default white space tokenizer.
   */
  private static final class ChineseTokenizerFactory {
    static {
      try {
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
      } catch (Throwable e) {
        logger
            .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
                + "of Chinese content may be degraded. For best quality clusters, "
                + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
      }
    }

    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        return new ExtendedWhitespaceTokenizer();
      }
    }

    private final static class ChineseTokenizer implements ITokenizer {
      private final static Pattern numeric = Pattern
          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

      private Tokenizer sentenceTokenizer;
      private TokenStream wordTokenFilter;
      private CharTermAttribute term = null;

      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
            Reader.class).newInstance((Reader) null);
        this.tokenFilterClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
      }

      public short nextToken() throws IOException {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken) {
          short flags = 0;
          final char[] image = term.buffer();
          final int length = term.length();
          tempCharSequence.reset(image, 0, length);
          if (length == 1 && image[0] == ',') {
            // ChineseTokenizer seems to convert all punctuation to ','
            // characters
            flags = ITokenizer.TT_PUNCTUATION;
          } else if (numeric.matcher(tempCharSequence).matches()) {
            flags = ITokenizer.TT_NUMERIC;
          } else {
            flags = ITokenizer.TT_TERM;
          }
          return flags;
        }

        return ITokenizer.TT_EOF;
      }

      public void setTermBuffer(MutableCharArray array) {
        array.reset(term.buffer(), 0, term.length());
      }

      public void reset(Reader input) throws IOException {
        try {
          sentenceTokenizer.reset(input);
          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
              TokenStream.class).newInstance(sentenceTokenizer);
          term = wordTokenFilter.addAttribute(CharTermAttribute.class);
        } catch (Exception e) {
          throw ExceptionUtils.wrapAsRuntimeException(e);
        }
      }
    }
  }
}
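The ITokenizer contract exercised by the class above can also be driven directly. A hedged sketch; whether you get the Smart Chinese adapter or the whitespace fallback depends on the JARs present at runtime:

import java.io.StringReader;

import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;

public class TokenizerFactorySketch {
  public static void main(String[] args) throws Exception {
    ITokenizer tokenizer = new LuceneCarrot2TokenizerFactory()
        .getTokenizer(LanguageCode.CHINESE_SIMPLIFIED);
    tokenizer.reset(new StringReader("Lucene and Solr, price 123.45"));

    MutableCharArray image = new MutableCharArray("");
    short type;
    // nextToken() returns a token-type flag, or TT_EOF at end of input.
    while ((type = tokenizer.nextToken()) != ITokenizer.TT_EOF) {
      tokenizer.setTermBuffer(image); // copy the current token image out
      System.out.println(image + " -> type " + type);
    }
  }
}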
@@ -0,0 +1,141 @@
package org.apache.solr.handler.clustering.carrot2;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Collection;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.analysis.StopFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.slf4j.Logger;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

/**
 * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
 * words from a field's StopFilter to the default stop words used in Carrot2,
 * for all languages Carrot2 supports. Completely replacing Carrot2 stop words
 * with Solr's wouldn't make much sense because clustering needs more aggressive
 * stop words removal. In other words, if something is a stop word during
 * indexing, then it should also be a stop word during clustering, but not the
 * other way round.
 */
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
    ILexicalDataFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);

  @Init
  @Input
  @Attribute(key = "solrIndexSchema")
  private IndexSchema schema;

  @Processing
  @Input
  @Attribute(key = "solrFieldNames")
  private Set<String> fieldNames;

  /**
   * A lazily-built cache of stop words per field.
   */
  private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();

  /**
   * Carrot2's default lexical resources to use in addition to Solr's stop
   * words.
   */
  private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();

  /**
   * Obtains stop words for a field from the associated
   * {@link StopFilterFactory}, if any.
   */
  private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // No need to synchronize here, Carrot2 ensures that instances
    // of this class are not used by multiple threads at a time.
    if (!solrStopWords.containsKey(fieldName)) {
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
          .getAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
            .getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet, but
            // the getStopWords() method returns a Set<?>, so we need to cast.
            solrStopWords.put(fieldName,
                (CharArraySet) ((StopFilterFactory) factory).getStopWords());
          }

          if (factory instanceof CommonGramsFilterFactory) {
            solrStopWords.put(fieldName,
                (CharArraySet) ((CommonGramsFilterFactory) factory)
                    .getCommonWords());
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }

  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}
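For illustration only: once Carrot2's attribute mechanism has injected the schema and field-name attributes declared above (there are no direct setters), the merged stop-word view behaves like this hedged sketch:

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.util.MutableCharArray;

public class LexicalDataSketch {
  // "factory" is assumed to be an instance whose solrIndexSchema and
  // solrFieldNames attributes Carrot2 has already bound.
  static boolean isStop(SolrStopwordsCarrot2LexicalDataFactory factory, String word) {
    ILexicalData lexicalData = factory.getLexicalData(LanguageCode.ENGLISH);
    // True if the word appears in a clustered field's StopFilter/CommonGrams
    // set, or in Carrot2's own default stop words.
    return lexicalData.isCommonWord(new MutableCharArray(word));
  }
}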
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clustering.carrot2;
 * limitations under the License.
 */

+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;

@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;

-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;

/**
 *
 */
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
  @Test
  public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {

  @Test
  public void testWithoutSubclusters() throws Exception {
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
        1, 1, 0);
  }
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
  public void testWithSubclusters() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
  }

  @Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
    params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
        params), 1, 3, 0);
  }

+  @Test
+  public void testClusterScores() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      final Double score = getScore(cluster);
+      assertNotNull(score);
+      assertEquals(0.25 * i++, score, 0);
+    }
+  }
+
+  @Test
+  public void testOtherTopics() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+    }
+  }
+
  @Test
  public void testCarrotAttributePassing() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
        params), 1, 3, 0);
  }

+  @Test
+  public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+    checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+        "online,customsolrstopword,customsolrstoplabel");
+  }
+
+  @Test
+  public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+    checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+        "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+  }
+
+  private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+      throws IOException {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        wordsToCheck);
+
+    // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+    // stoplabels.en, so we're expecting only one cluster with label "online".
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine(engineName), 1, params);
+    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+  }
+
+  @Test
+  public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        "online,solrownstopword");
+
+    // "solrownstopword" is in stopwords.txt, so we're expecting
+    // only one cluster with label "online".
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine("lexical-resource-check"), 1, params);
+    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+  }
+
+  @Test
+  public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    // Force string fields to be used for clustering. Does not make sense
+    // in a real world, but does the job in the test.
+    params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+    params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        "online,solrownstopword");
+
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine("lexical-resource-check"), 2, params);
+    assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+    assertEquals(ImmutableList.of("solrownstopword"),
+        getLabels(clusters.get(1)));
+  }
+
  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()
        .getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    return engine;
  }

-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumClusters) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
  }

-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumClusters, SolrParams clusteringParams) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
  }


-  private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
      int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
    // Get all documents to cluster
    RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
    Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
    SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
-    List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+    @SuppressWarnings("unchecked")
+    List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
    req.close();
    assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
    checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    }
  }

-  private void checkClusters(List results, int expectedDocCount,
+  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
      int expectedLabelCount, int expectedSubclusterCount) {
    for (int i = 0; i < results.size(); i++) {
-      NamedList cluster = (NamedList) results.get(i);
+      NamedList<Object> cluster = results.get(i);
      checkCluster(cluster, expectedDocCount, expectedLabelCount,
          expectedSubclusterCount);
    }
  }

-  private void checkClusters(List results, boolean hasSubclusters) {
+  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
    for (int i = 0; i < results.size(); i++) {
-      checkCluster((NamedList) results.get(i), hasSubclusters);
+      checkCluster(results.get(i), hasSubclusters);
    }
  }

-  private void checkCluster(NamedList cluster, boolean hasSubclusters) {
-    List docs = (List) cluster.get("docs");
+  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+    List<Object> docs = getDocs(cluster);
    assertNotNull("docs is null and it shouldn't be", docs);
    for (int j = 0; j < docs.size(); j++) {
      String id = (String) docs.get(j);
      assertNotNull("id is null and it shouldn't be", id);
    }

-    List labels = (List) cluster.get("labels");
+    List<String> labels = getLabels(cluster);
    assertNotNull("labels is null but it shouldn't be", labels);

    if (hasSubclusters) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
      assertNotNull("subclusters is null but it shouldn't be", subclusters);
    }
  }

-  private void checkCluster(NamedList cluster, int expectedDocCount,
+  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
      int expectedLabelCount, int expectedSubclusterCount) {
    checkCluster(cluster, expectedSubclusterCount > 0);
    assertEquals("number of docs in cluster", expectedDocCount,
-        ((List) cluster.get("docs")).size());
+        getDocs(cluster).size());
    assertEquals("number of labels in cluster", expectedLabelCount,
-        ((List) cluster.get("labels")).size());
+        getLabels(cluster).size());

    if (expectedSubclusterCount > 0) {
-      List subclusters = (List) cluster.get("clusters");
-      assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
+      assertEquals("number of subclusters in cluster",
+          expectedSubclusterCount, subclusters.size());
    }
  }

+  @SuppressWarnings("unchecked")
+  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+    return (List<NamedList<Object>>) cluster.get("clusters");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<String> getLabels(NamedList<Object> cluster) {
+    return (List<String>) cluster.get("labels");
+  }
+
+  private Double getScore(NamedList<Object> cluster) {
+    return (Double) cluster.get("score");
+  }
+
+  private Boolean isOtherTopics(NamedList<Object> cluster) {
+    return (Boolean) cluster.get("other-topics");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<Object> getDocs(NamedList<Object> cluster) {
+    return (List<Object>) cluster.get("docs");
+  }
}
@@ -0,0 +1,82 @@
package org.apache.solr.handler.clustering.carrot2;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.List;

import org.carrot2.core.Cluster;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;

import com.google.common.collect.Lists;

/**
 * A mock implementation of Carrot2 clustering algorithm for testing whether the
 * customized lexical resource lookup works correctly. This algorithm ignores
 * the input documents and instead for each word from {@link #wordsToCheck}, it
 * outputs a cluster labeled with the word only if the word is neither a stop
 * word nor a stop label.
 */
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
    ProcessingComponentBase implements IClusteringAlgorithm {

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  private List<Cluster> clusters;

  @Input
  @Processing
  @Attribute
  private String wordsToCheck;

  private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();

  @Override
  public void process() throws ProcessingException {
    clusters = Lists.newArrayList();
    if (wordsToCheck == null) {
      return;
    }

    // Test with Maltese so that the English clustering performed in other tests
    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
}
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
  @IntRange(min = 1, max = 5)
  private int labels = 1;

+  @Input
+  @Processing
+  @Attribute
+  private int otherTopicsModulo = 0;
+
  @Override
  public void process() throws ProcessingException {
    clusters = Lists.newArrayList();

@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
    int documentIndex = 1;
    for (Document document : documents) {
      StringBuilder label = new StringBuilder("Cluster " + documentIndex);
-      Cluster cluster = createCluster(label.toString(), document);
+      Cluster cluster = createCluster(label.toString(), documentIndex, document);
      clusters.add(cluster);
      for (int i = 1; i <= depth; i++) {
        label.append(".");
        label.append(i);
-        Cluster newCluster = createCluster(label.toString(), document);
-        cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
        cluster = newCluster;
      }
      documentIndex++;
    }
  }

-  private Cluster createCluster(String labelBase, Document... documents) {
+  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
    Cluster cluster = new Cluster();
+    cluster.setScore(documentIndex * 0.25);
+    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+    {
+      cluster.setOtherTopics(true);
+    }
    for (int i = 0; i < labels; i++) {
      cluster.addPhrases(labelBase + "#" + (i + 1));
    }
@@ -0,0 +1 @@
customsolrstoplabel

@@ -0,0 +1 @@
customsolrstopword

@@ -0,0 +1 @@
customsolrstoplabelcustomdir

@@ -0,0 +1 @@
customsolrstopwordcustomdir
@@ -396,6 +396,15 @@
      <str name="name">mock</str>
      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check-custom-resource-dir</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+    </lst>
  </searchComponent>

  <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
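With several engines registered as above, a request can select one by name. A hedged sketch; "clustering.engine" is the ClusteringComponent's engine-selection parameter as this editor recalls it (verify against ClusteringParams):

import org.apache.solr.common.params.ModifiableSolrParams;

public class EngineSelectionSketch {
  static ModifiableSolrParams params() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("clustering", true);
    // Pick the engine registered as "lexical-resource-check" above.
    params.set("clustering.engine", "lexical-resource-check");
    return params;
  }
}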
@@ -55,4 +55,5 @@ to
was
will
with
+solrownstopword
@@ -1198,17 +1198,20 @@
    <lst name="engine">
      <!-- The name, only one can be named "default" -->
      <str name="name">default</str>

      <!-- Class name of Carrot2 clustering algorithm.

           Currently available algorithms are:

           * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
           * org.carrot2.clustering.stc.STCClusteringAlgorithm
+           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm

+           See http://project.carrot2.org/algorithms.html for the
+           algorithm's characteristics.
        -->
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>

      <!-- Overriding values for Carrot2 default algorithm attributes.

           For a description of all available attributes, see:

@@ -1220,6 +1223,19 @@
        -->
      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>

+      <!-- Location of Carrot2 lexical resources.
+
+           A directory from which to load Carrot2-specific stop words
+           and stop labels. Absolute or relative to Solr config directory.
+           If a specific resource (e.g. stopwords.en) is present in the
+           specified dir, it will completely override the corresponding
+           default one that ships with Carrot2.
+
+           For an overview of Carrot2 lexical resources, see:
+           http://download.carrot2.org/head/manual/#chapter.lexical-resources
+        -->
+      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
      <!-- The language to assume for the documents.

           For a list of allowed values, see: