LUCENE-3622: merge trunk (1212397:1212829)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3622@1212830 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-12-10 18:05:08 +00:00
commit 5627b52e70
22 changed files with 962 additions and 519 deletions

View File

@ -217,7 +217,11 @@ Changes in backwards compatibility policy
instances. Furthermore, IndexReader.setNorm() was removed. If you need
customized norm values, the recommended way to do this is by modifying
SimilarityProvider to use an external byte[] or one of the new DocValues
fields (LUCENE-3108). (Uwe Schindler, Robert Muir)
fields (LUCENE-3108). Alternatively, to dynamically change norms (boost
*and* length norm) at query time, wrap your IndexReader using
FilterIndexReader, overriding FilterIndexReader.norms(). To persist the
changes on disk, copy the FilteredIndexReader to a new index using
IndexWriter.addIndexes(). (Uwe Schindler, Robert Muir)
Changes in Runtime Behavior
@ -676,6 +680,22 @@ Security fixes
prevents this as best as it can by throwing AlreadyClosedException
also on clones. (Uwe Schindler, Robert Muir)
API Changes
* LUCENE-3606: IndexReader will be made read-only in Lucene 4.0, so all
methods that allow deleting or undeleting documents via IndexReader were
deprecated; you should use IndexWriter now. Consequently
IndexReader.commit() and all open(), openIfChanged(), clone() methods
taking readOnly booleans (or IndexDeletionPolicy instances) were
deprecated. IndexReader.setNorm() is superfluous and was deprecated.
If you have to change per-document boosts, use CustomScoreQuery.
If you want to dynamically change norms (boost *and* length norm) at
query time, wrap your IndexReader using FilterIndexReader, overriding
FilterIndexReader.norms(). To persist the changes on disk, copy the
FilteredIndexReader to a new index using IndexWriter.addIndexes().
In Lucene 4.0, SimilarityProvider will allow you to customize scoring
using external norms, too. (Uwe Schindler, Robert Muir)
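For illustration, a minimal sketch of the recommended query-time approach (the field name "content" and the source of the custom norms array are assumptions for this example, not part of this change):

import java.io.IOException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;

// Wraps a reader to serve externally computed norms for one field.
public class CustomNormsReader extends FilterIndexReader {
  private final byte[] customNorms; // one norm byte per document, assumed precomputed

  public CustomNormsReader(IndexReader in, byte[] customNorms) {
    super(in);
    this.customNorms = customNorms;
  }

  @Override
  public byte[] norms(String field) throws IOException {
    // Serve custom norms for the "content" field; delegate for all others.
    return "content".equals(field) ? customNorms : super.norms(field);
  }
}

To persist the modified norms, copy the wrapped view to a new index, e.g. writer.addIndexes(new CustomNormsReader(reader, myNorms)). Note that IndexReader also exposes a norms(String, byte[], int) overload; depending on usage it may need the same treatment.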
New Features
* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
@ -691,6 +711,12 @@ Bug fixes
* LUCENE-3627: Don't let an errant 0-byte segments_N file corrupt the index.
(Ken McCracken via Mike McCandless)
* LUCENE-3630: The internal method MultiReader.doOpenIfChanged(boolean doClone)
was overriding IndexReader.doOpenIfChanged(boolean readOnly), thereby changing
the contract of the overridden method. This method was renamed and made private.
ParallelReader did not have this bug, but its implementation method was also
made private. (Uwe Schindler)
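The underlying pitfall is plain Java method resolution: parameter names are not part of a method's signature, so a subclass method with the same name and parameter types silegally overrides the superclass method even when the boolean means something different. A minimal illustration (class and method names are only for demonstration):

class Base {
  // The boolean here means "readOnly".
  protected String doOpenIfChanged(boolean readOnly) {
    return "base, readOnly=" + readOnly;
  }
}

class Sub extends Base {
  // Intended as an unrelated internal method whose boolean means "doClone",
  // but Java resolves it as an override of Base.doOpenIfChanged(boolean).
  @Override
  protected String doOpenIfChanged(boolean doClone) {
    return "sub, doClone=" + doClone;
  }
}

Renaming the subclass method (to doReopen) and making it private, as the diff below does, removes the accidental override.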
Documentation
* LUCENE-3597: Fixed incorrect grouping documentation. (Martijn van Groningen, Robert Muir)

View File

@ -76,6 +76,11 @@ API Changes
* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
* LUCENE-3606: FieldNormModifier was deprecated, because IndexReader's
setNorm() was deprecated. Furthermore, this class is broken, as it does
not take position overlaps into account while recalculating norms.
(Uwe Schindler, Robert Muir)
Bug Fixes
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was

View File

@ -108,7 +108,7 @@ public class MultiReader extends IndexReader implements Cloneable {
*/
@Override
protected synchronized IndexReader doOpenIfChanged() throws CorruptIndexException, IOException {
return doOpenIfChanged(false);
return doReopen(false);
}
/**
@ -123,7 +123,7 @@ public class MultiReader extends IndexReader implements Cloneable {
@Override
public synchronized Object clone() {
try {
return doOpenIfChanged(true);
return doReopen(true);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
@ -141,7 +141,7 @@ public class MultiReader extends IndexReader implements Cloneable {
* @throws CorruptIndexException
* @throws IOException
*/
protected IndexReader doOpenIfChanged(boolean doClone) throws CorruptIndexException, IOException {
private IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
ensureOpen();
boolean changed = false;

View File

@ -243,7 +243,7 @@ public class ParallelReader extends IndexReader {
return doReopen(false);
}
protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
private IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
ensureOpen();
boolean reopened = false;

View File

@ -30,11 +30,37 @@ import org.apache.lucene.util.LuceneTestCase;
*/
public class TestIndexReaderClone extends LuceneTestCase {
private void assertDelDocsRefCountEquals(int refCount, SegmentReader reader) {
assertEquals(refCount, reader.liveDocsRef.get());
public void testDirectoryReader() throws Exception {
final Directory dir = createIndex(0);
performDefaultTests(IndexReader.open(dir));
dir.close();
}
public void testCloseStoredFields() throws Exception {
public void testMultiReader() throws Exception {
final Directory dir1 = createIndex(0);
final IndexReader r1 = IndexReader.open(dir1);
final Directory dir2 = createIndex(0);
final IndexReader r2 = IndexReader.open(dir2);
final MultiReader mr = new MultiReader(r1, r2);
performDefaultTests(mr);
dir1.close();
dir2.close();
}
public void testParallelReader() throws Exception {
final Directory dir1 = createIndex(0);
final IndexReader r1 = IndexReader.open(dir1);
final Directory dir2 = createIndex(1);
final IndexReader r2 = IndexReader.open(dir2);
final ParallelReader pr = new ParallelReader();
pr.add(r1);
pr.add(r2);
performDefaultTests(pr);
dir1.close();
dir2.close();
}
private Directory createIndex(int no) throws Exception {
final Directory dir = newDirectory();
IndexWriter w = new IndexWriter(
dir,
@ -42,13 +68,19 @@ public class TestIndexReaderClone extends LuceneTestCase {
setMergePolicy(newLogMergePolicy(false))
);
Document doc = new Document();
doc.add(newField("field", "yes it's stored", TextField.TYPE_STORED));
doc.add(newField("field"+no, "yes it's stored", TextField.TYPE_STORED));
w.addDocument(doc);
w.close();
IndexReader r1 = IndexReader.open(dir);
return dir;
}
private void performDefaultTests(IndexReader r1) throws Exception {
IndexReader r2 = (IndexReader) r1.clone();
assertTrue(r1 != r2);
TestIndexReader.assertIndexEquals(r1, r2);
r1.close();
r2.close();
dir.close();
TestIndexReaderReopen.assertReaderClosed(r1, true, true);
TestIndexReaderReopen.assertReaderClosed(r2, true, true);
}
}

View File

@ -757,7 +757,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
}
}
private void assertReaderClosed(IndexReader reader, boolean checkSubReaders, boolean checkNormsClosed) {
static void assertReaderClosed(IndexReader reader, boolean checkSubReaders, boolean checkNormsClosed) {
assertEquals(0, reader.getRefCount());
if (checkNormsClosed && reader instanceof SegmentReader) {

View File

@ -195,6 +195,8 @@ New Features
"multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
specify <fieldType="multiterm"> (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
* SOLR-2481: Add support for commitWithin in DataImportHandler (Sami Siren via yonik)
Optimizations
----------------------
@ -409,6 +411,7 @@ New Features
* SOLR-2919: Added support for localized range queries when the analysis chain uses
CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
Bug Fixes
----------------------
* SOLR-2912: Fixed File descriptor leak in ShowFileRequestHandler (Michael Ryan, shalin)
@ -418,6 +421,9 @@ Bug Fixes
* SOLR-2509: StringIndexOutOfBoundsException in the spellchecker collate when the term contains
a hyphen. (Thomas Gambier caught the bug, Steffen Godskesen did the patch, via Erick Erickson)
* SOLR-2955: Fixed IllegalStateException when querying with group.sort=score desc in sharded
environment. (Steffen Elberg Godskesen, Martijn van Groningen)
Other Changes
----------------------

View File

@ -13,7 +13,27 @@ $Id$
================== Release 3.6.0 ==================
(No Changes)
* SOLR-2937: Configuring the number of contextual snippets used for
search results clustering. The hl.snippets parameter is now respected
by the clustering plugin and can be overridden by carrot.summarySnippets
if needed (Stanislaw Osinski).
* SOLR-2938: Clustering on multiple fields. The carrot.title and
carrot.snippet parameters can now take comma- or space-separated lists of
field names to cluster (Stanislaw Osinski).
* SOLR-2939: Clustering of multilingual search results. The document's
language field can be passed in the carrot.lang parameter; the carrot.lcmap
parameter enables mapping of language codes to ISO 639 (Stanislaw Osinski).
* SOLR-2940: Passing values for custom Carrot2 fields. The custom field
mappings are defined using the carrot.custom parameter (Stanislaw Osinski).
* SOLR-2941: NullPointerException on clustering component initialization
when schema does not have a unique key field (Stanislaw Osinski).
* SOLR-2942: ClassCastException when passing non-textual fields for
clustering (Stanislaw Osinski).
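As a rough sketch of how these new options combine on the query side (field names and mapping values below are illustrative, not taken from this commit):

import org.apache.solr.common.params.ModifiableSolrParams;

ModifiableSolrParams params = new ModifiableSolrParams();
params.set("clustering", "true");
params.set("carrot.title", "title,heading");        // SOLR-2938: multiple title fields
params.set("carrot.snippet", "snippet,body");       // SOLR-2938: multiple content fields
params.set("carrot.lang", "lang");                  // SOLR-2939: per-document language field
params.set("carrot.lcmap", "chinese:zh_cn");        // SOLR-2939: map custom codes to ISO 639
params.add("carrot.custom", "intfield_i:intField"); // SOLR-2940: solrField:carrot2Field
params.set("carrot.summarySnippets", "2");          // SOLR-2937: snippets per summary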
================== Release 3.5.0 ==================
@ -21,10 +41,10 @@ $Id$
================== Release 3.4.0 ==================
* SOLR-2706: The carrot.lexicalResourcesDir parameter now works
with absolute directories (Stanislaw Osinski)

* SOLR-2692: Typo in param name fixed: "carrot.fragzise" changed to
"carrot.fragSize" (Stanislaw Osinski).
================== Release 3.3.0 ==================

View File

@ -19,15 +19,18 @@ package org.apache.solr.handler.clustering.carrot2;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
@ -45,6 +48,7 @@ import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
@ -54,6 +58,7 @@ import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
@ -77,13 +82,13 @@ import com.google.common.io.Closeables;
* @see "http://project.carrot2.org"
*/
public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
/**
* The subdirectory in Solr config dir to read customized Carrot2 resources from.
*/
private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
/**
* Name of Carrot2 document's field containing Solr document's identifier.
@ -102,7 +107,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
private static class SolrResourceLocator implements IResourceLocator {
private final SolrResourceLoader resourceLoader;
private final String carrot2ResourcesDir;
@ -227,8 +232,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
}
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
final SolrParams initParams = SolrParams.toSolrParams(config);
@ -243,13 +248,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Additionally, we set a custom lexical resource factory for Carrot2 that
// will use both Carrot2 default stop words as well as stop words from
// the StopFilter defined on the field.
BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
.stemmerFactory(LuceneCarrot2StemmerFactory.class)
.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrIndexSchema", core.getSchema());
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
@ -261,7 +266,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
SchemaField uniqueField = core.getSchema().getUniqueKeyField();
if (uniqueField == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
}
this.idFieldName = uniqueField.getName();
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
@ -283,25 +294,35 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
fields.add(idFieldName);
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
return fields;
fields.addAll(getCustomFieldsMap(solrParams).keySet());
String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME);
if (StringUtils.isNotBlank(languageField)) {
fields.add(languageField);
}
return fields;
}
/**
* Returns the names of fields that will be delivering the actual
* content for clustering. Currently, there are two such fields: document
* title and document content.
*/
private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
SolrParams solrParams = sreq.getParams();
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
if (StringUtils.isBlank(snippetFieldSpec)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
return Sets.newHashSet(titleField, snippetField);
}
final Set<String> fields = Sets.newHashSet();
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
return fields;
}
/**
* Prepares Carrot2 documents for clustering.
@ -313,8 +334,27 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
SolrCore core = sreq.getCore();
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
// Maps Solr field names to Carrot2 custom field names
Map<String, String> customFields = getCustomFieldsMap(solrParams);
// Parse language code map string into a map
Map<String, String> languageCodeMap = Maps.newHashMap();
if (StringUtils.isNotBlank(languageField)) {
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "")
.split("[, ]")) {
final String[] split = pair.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
languageCodeMap.put(split[0], split[1]);
} else {
log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP
+ ": '" + pair + "'. Skipping this mapping.");
}
}
}
// Get the documents
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
@ -325,12 +365,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
snippetFieldAry = snippetFieldSpec.split("[, ]");
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
args.put(HighlightParams.SIMPLE_POST, "");
args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
@Override
public SolrIndexSearcher getSearcher() {
@ -352,7 +393,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
while (docsIter.hasNext()) {
SolrDocument sdoc = docsIter.next();
String snippet = getValue(sdoc, snippetField);
String snippet = null;
// TODO: docIds will be null when running distributed search.
// See comment in ClusteringComponent#finishStage().
if (produceSummary && docIds != null) {
@ -360,34 +402,115 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
//should only be one document
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
final StringBuilder sb = new StringBuilder();
for (int j = 0; j < snippetFieldAry.length; j++) {
// Join fragments with a period, so that Carrot2 does not create
// cross-fragment phrases; such phrases rarely make sense.
String [] highlt = tmp.get(snippetFieldAry[j]);
if (highlt != null && highlt.length > 0) {
for (int i = 0; i < highlt.length; i++) {
sb.append(highlt[i]);
sb.append(" . ");
}
}
}
snippet = sb.toString();
}
}
// If summaries not enabled or summary generation failed, use full content.
if (snippet == null) {
snippet = getConcatenated(sdoc, snippetFieldSpec);
}
// Create a Carrot2 document
Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));
// Store Solr id of the document, we need it to map document instances
// found in clusters back to identifiers.
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
// Set language
if (StringUtils.isNotBlank(languageField)) {
Collection<Object> languages = sdoc.getFieldValues(languageField);
if (languages != null) {
// Use the first Carrot2-supported language
for (Object l : languages) {
String lang = ObjectUtils.toString(l, "");
if (languageCodeMap.containsKey(lang)) {
lang = languageCodeMap.get(lang);
}
// Language detection Library for Java uses dashes to separate
// language variants, such as 'zh-cn', but Carrot2 uses underscores.
if (lang.indexOf('-') > 0) {
lang = lang.replace('-', '_');
}
// If the language is supported by Carrot2, we'll get a non-null value
final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
if (carrot2Language != null) {
carrotDocument.setLanguage(carrot2Language);
break;
}
}
}
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
// Add custom fields
if (customFields != null) {
for (Entry<String, String> entry : customFields.entrySet()) {
carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
}
}
result.add(carrotDocument);
}
return result;
}
protected String getValue(SolrDocument sdoc, String field) {
/**
* Prepares a map of Solr field names (keys) to the corresponding Carrot2
* custom field names.
*/
private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
Map<String, String> customFields = Maps.newHashMap();
String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
if (customFieldsSpec != null) {
customFields = Maps.newHashMap();
for (String customFieldSpec : customFieldsSpec) {
String [] split = customFieldSpec.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
customFields.put(split[0], split[1]);
} else {
log.warn("Unsupported format for " + CarrotParams.CUSTOM_FIELD_NAME
+ ": '" + customFieldSpec + "'. Skipping this field definition.");
}
}
}
return customFields;
}
private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
if(vals == null) return "";
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append((String)ite.next()).append(" . ");
for (String field : fieldsSpec.split("[, ]")) {
Collection<Object> vals = sdoc.getFieldValues(field);
if (vals == null) continue;
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append(ObjectUtils.toString(ite.next())).append(" . ");
}
}
return result.toString().trim();
}

View File

@ -27,17 +27,24 @@ public interface CarrotParams {
String CARROT_PREFIX = "carrot.";
String ALGORITHM = CARROT_PREFIX + "algorithm";
String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
String URL_FIELD_NAME = CARROT_PREFIX + "url";
String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
String CUSTOM_FIELD_NAME = CARROT_PREFIX + "custom";
String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME, LANGUAGE_FIELD_NAME,
PRODUCE_SUMMARY, SUMMARY_FRAGSIZE, SUMMARY_SNIPPETS, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS,
LEXICAL_RESOURCES_DIR);
}

View File

@ -50,192 +50,192 @@ import org.tartarus.snowball.ext.TurkishStemmer;
* in this class.
*/
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(LuceneCarrot2StemmerFactory.class);

  @Override
  public IStemmer getStemmer(LanguageCode language) {
    switch (language) {
      case ARABIC:
        return ArabicStemmerFactory.createStemmer();

      case CHINESE_SIMPLIFIED:
        return IdentityStemmer.INSTANCE;

      default:
        /*
         * For other languages, try to use snowball's stemming.
         */
        return SnowballStemmerFactory.createStemmer(language);
    }
  }

  /**
   * Factory of {@link IStemmer} implementations from the <code>snowball</code>
   * project.
   */
  private final static class SnowballStemmerFactory {
    /**
     * Static hard mapping from language codes to stemmer classes in Snowball.
     * This mapping is not dynamic because we want to keep the possibility to
     * obfuscate these classes.
     */
    private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
    static {
      snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
      snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
      snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
      snowballStemmerClasses.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
    }

    /**
     * An adapter converting Snowball programs into {@link IStemmer} interface.
     */
    private static class SnowballStemmerAdapter implements IStemmer {
      private final SnowballProgram snowballStemmer;

      public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
        this.snowballStemmer = snowballStemmer;
      }

      public CharSequence stem(CharSequence word) {
        snowballStemmer.setCurrent(word.toString());
        if (snowballStemmer.stem()) {
          return snowballStemmer.getCurrent();
        } else {
          return null;
        }
      }
    }

    /**
     * Create and return an {@link IStemmer} adapter for a
     * {@link SnowballProgram} for a given language code. An identity stemmer is
     * returned for unknown languages.
     */
    public static IStemmer createStemmer(LanguageCode language) {
      final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
          .get(language);

      if (stemmerClazz == null) {
        logger.warn("No Snowball stemmer class for: " + language.name()
            + ". Quality of clustering may be degraded.");
        return IdentityStemmer.INSTANCE;
      }

      try {
        return new SnowballStemmerAdapter(stemmerClazz.newInstance());
      } catch (Exception e) {
        logger.warn("Could not instantiate snowball stemmer"
            + " for language: " + language.name()
            + ". Quality of clustering may be degraded.", e);

        return IdentityStemmer.INSTANCE;
      }
    }
  }

  /**
   * Factory of {@link IStemmer} implementations for the
   * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
   * to be present in classpath, otherwise an empty (identity) stemmer is
   * returned.
   */
  private static class ArabicStemmerFactory {
    static {
      try {
        ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
        ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
      } catch (ClassNotFoundException e) {
        logger.warn(
            "Could not instantiate Lucene stemmer for Arabic, clustering quality "
                + "of Arabic content may be degraded. For best quality clusters, "
                + "make sure Lucene's Arabic analyzer JAR is in the classpath",
            e);
      }
    }

    /**
     * Adapter to lucene-contrib Arabic analyzers.
     */
    private static class LuceneStemmerAdapter implements IStemmer {
      private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
      private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;

      private char[] buffer = new char[0];

      private LuceneStemmerAdapter() throws Exception {
        delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
        normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
      }

      public CharSequence stem(CharSequence word) {
        if (word.length() > buffer.length) {
          buffer = new char[word.length()];
        }

        for (int i = 0; i < word.length(); i++) {
          buffer[i] = word.charAt(i);
        }

        int newLen = normalizer.normalize(buffer, word.length());
        newLen = delegate.stem(buffer, newLen);

        if (newLen != word.length() || !equals(buffer, newLen, word)) {
          return CharBuffer.wrap(buffer, 0, newLen);
        }

        // Same-same.
        return null;
      }

      private boolean equals(char[] buffer, int len, CharSequence word) {
        assert len == word.length();

        for (int i = 0; i < len; i++) {
          if (buffer[i] != word.charAt(i))
            return false;
        }

        return true;
      }
    }

    public static IStemmer createStemmer() {
      try {
        return new LuceneStemmerAdapter();
      } catch (Throwable e) {
        return IdentityStemmer.INSTANCE;
      }
    }
  }

  /**
   * An implementation of {@link IStemmer} that always returns <code>null</code>
   * which means no stemming.
   */
  private static class IdentityStemmer implements IStemmer {
    private final static IdentityStemmer INSTANCE = new IdentityStemmer();

    @Override
    public CharSequence stem(CharSequence word) {
      return null;
    }
  }
}

View File

@ -40,117 +40,117 @@ import org.slf4j.Logger;
* Lucene APIs need to change, the changes can be made in this class.
*/
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(LuceneCarrot2TokenizerFactory.class);

  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    switch (language) {
      case CHINESE_SIMPLIFIED:
        return ChineseTokenizerFactory.createTokenizer();

      /*
       * We use our own analyzer for Arabic. Lucene's version has special
       * support for Nonspacing-Mark characters (see
       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
       * have them included as letters in the parser.
       */
      case ARABIC:
        // Intentional fall-through.

      default:
        return new ExtendedWhitespaceTokenizer();
    }
  }

  /**
   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
   * factory will fall back to the default white space tokenizer.
   */
  private static final class ChineseTokenizerFactory {
    static {
      try {
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
      } catch (Throwable e) {
        logger.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
            + "of Chinese content may be degraded. For best quality clusters, "
            + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
      }
    }

    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        return new ExtendedWhitespaceTokenizer();
      }
    }

    private final static class ChineseTokenizer implements ITokenizer {
      private final static Pattern numeric = Pattern
          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

      private Tokenizer sentenceTokenizer;
      private TokenStream wordTokenFilter;
      private CharTermAttribute term = null;

      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
            Reader.class).newInstance((Reader) null);
        this.tokenFilterClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
      }

      public short nextToken() throws IOException {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken) {
          short flags = 0;
          final char[] image = term.buffer();
          final int length = term.length();
          tempCharSequence.reset(image, 0, length);
          if (length == 1 && image[0] == ',') {
            // ChineseTokenizer seems to convert all punctuation to ','
            // characters
            flags = ITokenizer.TT_PUNCTUATION;
          } else if (numeric.matcher(tempCharSequence).matches()) {
            flags = ITokenizer.TT_NUMERIC;
          } else {
            flags = ITokenizer.TT_TERM;
          }
          return flags;
        }

        return ITokenizer.TT_EOF;
      }

      public void setTermBuffer(MutableCharArray array) {
        array.reset(term.buffer(), 0, term.length());
      }

      public void reset(Reader input) throws IOException {
        try {
          sentenceTokenizer.reset(input);
          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
              TokenStream.class).newInstance(sentenceTokenizer);
          term = wordTokenFilter.addAttribute(CharTermAttribute.class);
        } catch (Exception e) {
          throw ExceptionUtils.wrapAsRuntimeException(e);
        }
      }
    }
  }
}

View File

@ -53,89 +53,89 @@ import com.google.common.collect.Multimap;
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
    ILexicalDataFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);

  @Init
  @Input
  @Attribute(key = "solrIndexSchema")
  private IndexSchema schema;

  @Processing
  @Input
  @Attribute(key = "solrFieldNames")
  private Set<String> fieldNames;

  /**
   * A lazily-built cache of stop words per field.
   */
  private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();

  /**
   * Carrot2's default lexical resources to use in addition to Solr's stop
   * words.
   */
  private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();

  /**
   * Obtains stop words for a field from the associated
   * {@link StopFilterFactory}, if any.
   */
  private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // No need to synchronize here, Carrot2 ensures that instances
    // of this class are not used by multiple threads at a time.
    if (!solrStopWords.containsKey(fieldName)) {
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
          .getAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
            .getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet, but
            // the getStopWords() method returns a Set<?>, so we need to cast.
            solrStopWords.put(fieldName,
                (CharArraySet) ((StopFilterFactory) factory).getStopWords());
          }

          if (factory instanceof CommonGramsFilterFactory) {
            solrStopWords.put(fieldName,
                (CharArraySet) ((CommonGramsFilterFactory) factory)
                    .getCommonWords());
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }

  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}

View File

@ -280,8 +280,10 @@
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="url" type="string" indexed="true" stored="true" required="true" />
<field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true" />
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented

View File

@ -17,6 +17,7 @@ package org.apache.solr.handler.clustering;
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.junit.BeforeClass;
@ -34,6 +35,54 @@ public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
numberOfDocs++;
}
// Add a multi-valued snippet
final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
multiValuedSnippet.addField("id", numberOfDocs++);
multiValuedSnippet.addField("title", "Title");
multiValuedSnippet.addField("url", "URL");
multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
// Add a document with multi-field title and snippet
final SolrInputDocument multiFieldDoc = new SolrInputDocument();
multiFieldDoc.addField("id", numberOfDocs++);
multiFieldDoc.addField("title", "Title field");
multiFieldDoc.addField("heading", "Heading field");
multiFieldDoc.addField("url", "URL");
multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
// Add a document with one language supported by Carrot2
final SolrInputDocument docWithOneSupportedLanguage = new SolrInputDocument();
docWithOneSupportedLanguage.addField("id", numberOfDocs++);
docWithOneSupportedLanguage.addField("title", "");
docWithOneSupportedLanguage.addField("url", "one_supported_language");
docWithOneSupportedLanguage.addField("lang", "zh-cn");
assertNull(h.validateUpdate(adoc(docWithOneSupportedLanguage)));
// Add a document with more languages, one supported by Carrot2
final SolrInputDocument docWithOneSupportedLanguageOfMany = new SolrInputDocument();
docWithOneSupportedLanguageOfMany.addField("id", numberOfDocs++);
docWithOneSupportedLanguageOfMany.addField("url", "one_supported_language_of_many");
docWithOneSupportedLanguageOfMany.addField("lang", "zh-tw");
docWithOneSupportedLanguageOfMany.addField("lang", "POLISH");
docWithOneSupportedLanguageOfMany.addField("lang", "de");
assertNull(h.validateUpdate(adoc(docWithOneSupportedLanguageOfMany)));
// Add a document with custom fields
final SolrInputDocument docWithCustomFields = new SolrInputDocument();
docWithCustomFields.addField("id", numberOfDocs++);
docWithCustomFields.addField("url", "custom_fields");
docWithCustomFields.addField("intfield_i", 10);
docWithCustomFields.addField("floatfield_f", 10.5);
docWithCustomFields.addField("heading", "first");
docWithCustomFields.addField("heading", "second");
assertNull(h.validateUpdate(adoc(docWithCustomFields)));
assertNull(h.validateUpdate(commit()));
}

View File

@ -39,6 +39,7 @@ import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
@ -50,10 +51,10 @@ import com.google.common.collect.ImmutableList;
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
// Note: the expected number of clusters may change after upgrading Carrot2
// due to e.g. internal improvements or tuning of Carrot2 clustering.
final int expectedNumClusters = 10;
checkEngine(getClusteringEngine("default"), expectedNumClusters);
}
@Test
@ -88,10 +89,15 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize) throws IOException {
// Some documents don't have mining in the snippet
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 7);
}
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize, int summarySnippets,
String term, int expectedNumDocuments) throws IOException {
final TermQuery query = new TermQuery(new Term("snippet", "mine"));
// Two documents don't have mining in the snippet
int expectedNumDocuments = numberOfDocs - 2;
final TermQuery query = new TermQuery(new Term("snippet", term));
final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
@ -99,6 +105,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
Boolean.toString(enableHighlighting));
summaryParams
.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
summaryParams
.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));
final List<NamedList<Object>> summaryClusters = checkEngine(
getClusteringEngine("echo"), expectedNumDocuments,
expectedNumDocuments, query, summaryParams);
@ -169,66 +177,180 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
params), 1, 3, 0);
}
@Test
public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
  checkLexicalResourcesFromSolrConfig("lexical-resource-check",
      "online,customsolrstopword,customsolrstoplabel");
}

@Test
public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
  checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
      "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
    throws IOException {
  ModifiableSolrParams params = new ModifiableSolrParams();
  params.set("merge-resources", false);
  params.set(AttributeUtils.getKey(
      LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
      wordsToCheck);

  // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
  // stoplabels.mt, so we're expecting only one cluster with label "online".
  final List<NamedList<Object>> clusters = checkEngine(
      getClusteringEngine(engineName), 1, params);
  assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
}
@Test
public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
// "solrownstopword" is in stopwords.txt, so we're expecting
// only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 1, params);
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
}
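For reference, a minimal sketch of the lexical-data lookup these stop-word tests ultimately exercise, reusing the Carrot2 types that appear in LexicalResourcesCheckClusteringAlgorithm further down (the word and language here are illustrative):

// Ask Carrot2 whether a merged Solr stop word now counts as a common word.
ILexicalData lexicalData = new BasicPreprocessingPipeline().lexicalDataFactory
    .getLexicalData(LanguageCode.ENGLISH);
boolean stopped = lexicalData.isCommonWord(new MutableCharArray("solrownstopword"));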
@Test
public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
// Force string fields to be used for clustering. Does not make sense
// in the real world, but it does the job in the test.
params.set(CarrotParams.TITLE_FIELD_NAME, "url");
params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 2, params);
assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
assertEquals(ImmutableList.of("solrownstopword"),
getLabels(clusters.get(1)));
}
@Test
public void highlightingOfMultiValueField() throws Exception {
final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
false, 30, 3, "multi", 1).get(0)).get(1);
assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));
final String snippetWithSummary = getLabels(clusterWithHighlighting(
true, 30, 3, "multi", 1).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
}
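The multi-value assertions above presuppose an indexed document shaped roughly like this (a sketch; the field values are hypothetical, only the three-valued "snippet" field matters):

// Three values in a single multi-valued field; each must survive summarization.
SolrInputDocument doc = new SolrInputDocument();
doc.addField("snippet", "First value, which also mentions multi ...");
doc.addField("snippet", "Second value, which also mentions multi ...");
doc.addField("snippet", "Third value, which also mentions multi ...");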
@Test
public void concatenatingMultipleFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
"snippet")), params).get(0));
assertTrue("Label contains title field", labels.get(0).contains("Title field"));
assertTrue("Label contains heading field", labels.get(0).contains("Heading field"));
assertTrue("Label contains snippet field", labels.get(1).contains("Snippet field"));
assertTrue("Label contains body field", labels.get(1).contains("Body field"));
}
@Test
public void highlightingMultipleFields() throws Exception {
final TermQuery query = new TermQuery(new Term("snippet", "content"));
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));
final String snippetWithoutSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));
params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
final String snippetWithSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
}
@Test
public void oneCarrot2SupportedLanguage() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
}
@Test
public void oneCarrot2SupportedLanguageOfMany() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language_of_many")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
}
@Test
public void languageCodeMapping() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language_of_many")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
}
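CarrotParams.LANGUAGE_CODE_MAP takes NAME:code pairs; the translation it implies can be sketched as follows (the splitting logic is an assumption for illustration, not the engine's actual code):

// Map a field value such as "pl" onto a Carrot2 LanguageCode.
Map<String, LanguageCode> langMap = new HashMap<String, LanguageCode>();
for (String pair : "POLISH:pl".split(",")) {
  String[] kv = pair.split(":");
  langMap.put(kv[1], LanguageCode.valueOf(kv[0])); // "pl" -> LanguageCode.POLISH
}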
@Test
public void passingOfCustomFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.CUSTOM_FIELD_NAME, "intfield_i:intfield");
params.add(CarrotParams.CUSTOM_FIELD_NAME, "floatfield_f:floatfield");
params.add(CarrotParams.CUSTOM_FIELD_NAME, "heading:multi");
// Let the echo mock clustering algorithm know which custom field to echo
params.add("custom-fields", "intfield,floatfield,multi");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"custom_fields")), params).get(0));
assertEquals(5, labels.size());
assertEquals("Integer field", "10", labels.get(2));
assertEquals("Float field", "10.5", labels.get(3));
assertEquals("List field", "[first, second]", labels.get(4));
}
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
@ -273,7 +395,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
@SuppressWarnings("unchecked")
List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@ -302,7 +424,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
Object id = docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
@ -331,26 +453,26 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
}
}
@SuppressWarnings("unchecked")
private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
return (List<NamedList<Object>>) cluster.get("clusters");
}
@SuppressWarnings("unchecked")
private List<String> getLabels(NamedList<Object> cluster) {
return (List<String>) cluster.get("labels");
}
private Double getScore(NamedList<Object> cluster) {
return (Double) cluster.get("score");
}
private Boolean isOtherTopics(NamedList<Object> cluster) {
return (Boolean)cluster.get("other-topics");
}
@SuppressWarnings("unchecked")
private List<Object> getDocs(NamedList<Object> cluster) {
return (List<Object>) cluster.get("docs");
}
}

View File

@ -15,6 +15,7 @@ package org.apache.solr.handler.clustering.carrot2;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collections;
import java.util.List;
import org.carrot2.core.Cluster;
@ -48,6 +49,12 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Input
@Processing
@Attribute(key = "custom-fields")
private String customFields = "";
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayListWithCapacity(documents.size());
@ -55,6 +62,15 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
for (Document document : documents) {
final Cluster cluster = new Cluster();
cluster.addPhrases(document.getTitle(), document.getSummary());
if (document.getLanguage() != null) {
cluster.addPhrases(document.getLanguage().name());
}
for (String field : customFields.split(",")) {
Object value = document.getField(field);
if (value != null) {
cluster.addPhrases(value.toString());
}
}
cluster.addDocuments(document);
clusters.add(cluster);
}
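Given the loop above, a document like the following would be echoed as a cluster whose phrases are its title, its summary, and the custom field's string value (the two-argument constructor and setField are assumptions based on the Carrot2 Document API used here):

Document doc = new Document("Title field", "Snippet field");
doc.setField("intfield", 10); // echoed because "custom-fields" lists "intfield"
// process() then emits a cluster with phrases:
// ["Title field", "Snippet field", "10"]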

View File

@ -25,9 +25,7 @@ import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
@ -46,37 +44,37 @@ import com.google.common.collect.Lists;
*/
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
ProcessingComponentBase implements IClusteringAlgorithm {
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Input
@Processing
@Attribute
private String wordsToCheck;
private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
if (wordsToCheck == null) {
return;
}
// Test with Maltese so that the English clustering performed in other tests
// is not affected by the test stopwords and stoplabels.
ILexicalData lexicalData = preprocessing.lexicalDataFactory
.getLexicalData(LanguageCode.MALTESE);
for (String word : wordsToCheck.split(",")) {
if (!lexicalData.isCommonWord(new MutableCharArray(word))
&& !lexicalData.isStopLabel(word)) {
clusters.add(new Cluster(word));
}
}
}
}

View File

@ -17,6 +17,7 @@
package org.apache.solr.handler.dataimport;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.CommitUpdateCommand;
@ -27,8 +28,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.Map;
import java.util.Set;
/**
* <p> Writes documents to SOLR. </p>
@ -43,12 +42,14 @@ public class SolrWriter extends DIHWriterBase implements DIHWriter {
static final String LAST_INDEX_KEY = "last_index_time";
private final UpdateRequestProcessor processor;
private final int commitWithin;
SolrQueryRequest req;
public SolrWriter(UpdateRequestProcessor processor, SolrQueryRequest req) {
this.processor = processor;
this.req = req;
commitWithin = (req != null) ? req.getParams().getInt(UpdateParams.COMMIT_WITHIN, -1): -1;
}
@Override
@ -65,6 +66,7 @@ public class SolrWriter extends DIHWriterBase implements DIHWriter {
try {
AddUpdateCommand command = new AddUpdateCommand(req);
command.solrDoc = d;
command.commitWithin = commitWithin;
processor.processAdd(command);
} catch (Exception e) {
log.warn("Error creating document : " + d, e);
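The net effect, shown with an illustrative value: every add command issued during an import now carries the request's commitWithin deadline, so imported documents become visible without an explicit commit once the window elapses.

AddUpdateCommand command = new AddUpdateCommand(req);
command.solrDoc = d;
command.commitWithin = 1000; // propagated from UpdateParams.COMMIT_WITHIN on the request
processor.processAdd(command);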

View File

@ -24,6 +24,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.junit.After;
import org.junit.Before;
@ -80,6 +81,33 @@ public class TestContentStreamDataSource extends AbstractDataImportHandlerTestCa
assertEquals("Hello C1", ((List)doc.getFieldValue("desc")).get(0));
}
@Test
public void testCommitWithin() throws Exception {
DirectXmlRequest req = new DirectXmlRequest("/dataimport", xml);
ModifiableSolrParams params = params("command", "full-import",
"clean", "false", UpdateParams.COMMIT, "false",
UpdateParams.COMMIT_WITHIN, "1000");
req.setParams(params);
String url = "http://localhost:" + jetty.getLocalPort() + "/solr";
CommonsHttpSolrServer solrServer = new CommonsHttpSolrServer(url);
solrServer.request(req);
Thread.sleep(100);
ModifiableSolrParams queryAll = params("q", "*");
QueryResponse qres = solrServer.query(queryAll);
SolrDocumentList results = qres.getResults();
assertEquals(0, results.getNumFound());
Thread.sleep(1000);
for (int i = 0; i < 10; i++) {
qres = solrServer.query(queryAll);
results = qres.getResults();
if (2 == results.getNumFound()) {
return;
}
Thread.sleep(500);
}
fail("Commit should have occurred but it did not");
}
private class SolrInstance {
String name;
Integer port;

View File

@ -162,13 +162,18 @@ public class QueryComponent extends SearchComponent
//TODO: move weighting of sort
Sort groupSort = searcher.weightSort(cmd.getSort());
if (groupSort == null) {
groupSort = Sort.RELEVANCE;
}
// groupSort defaults to sort
String groupSortStr = params.get(GroupParams.GROUP_SORT);
if (groupSort == null) {
groupSort = new Sort();
}
//TODO: move weighting of sort
Sort sortWithinGroup = groupSortStr == null ? groupSort : searcher.weightSort(QueryParsing.parseSort(groupSortStr, req));
if (sortWithinGroup == null) {
sortWithinGroup = Sort.RELEVANCE;
}
groupingSpec.setSortWithinGroup(sortWithinGroup);
groupingSpec.setGroupSort(groupSort);
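Because old and new lines are interleaved in the hunk above, the resulting logic reads more clearly in one piece (a sketch of the post-change flow, with comments added):

Sort groupSort = searcher.weightSort(cmd.getSort()); // null when the request has no sort
if (groupSort == null) {
  groupSort = Sort.RELEVANCE; // previously defaulted to an empty new Sort()
}
String groupSortStr = params.get(GroupParams.GROUP_SORT);
// group.sort defaults to the (possibly defaulted) top-level sort.
Sort sortWithinGroup = groupSortStr == null
    ? groupSort
    : searcher.weightSort(QueryParsing.parseSort(groupSortStr, req));
if (sortWithinGroup == null) {
  sortWithinGroup = Sort.RELEVANCE;
}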

View File

@ -140,6 +140,7 @@ public class TestDistributedGrouping extends BaseDistributedSearchTestCase {
query("q", "*:*", "fq", s1 + ":a", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", i1 + " asc, id asc", "group.truncate", "true", "facet", "true", "facet.field", t1);
// We cannot validate distributed grouping with scoring as the first sort, since there is no global idf. We can only check that no errors occur.
simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", i1 + " desc", "group.sort", "score desc"); // SOLR-2955
simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", "score desc, _docid_ asc, id asc");
simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10);
}
@ -149,6 +150,7 @@ public class TestDistributedGrouping extends BaseDistributedSearchTestCase {
for (int i = 0; i < queryParams.length; i += 2) {
params.add(queryParams[i].toString(), queryParams[i + 1].toString());
}
params.set("shards", shards);
queryServer(params);
}