SOLR-2938: Clustering on multiple fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1212489 13f79535-47bb-0310-9956-ffa450edef68
2025-02-22 18:27:21 +00:00 · 2011-12-09 15:58:42 +00:00 · 2011-12-09 15:58:42 +00:00 · 4004e2abee
commit 4004e2abee
parent 8e52925f74
5 changed files with 102 additions and 30 deletions
--- a/solr/contrib/clustering/CHANGES.txt
+++ b/solr/contrib/clustering/CHANGES.txt
@ -18,6 +18,10 @@ $Id$
  by the clustering plugin, can be overridden by carrot.summarySnippets
  if needed (Stanislaw Osinski).

+* SOLR-2938: Clustering on multiple fields. The carrot.title and 
+  carrot.snippet can now take comma- or space-separated lists of
+  field names to cluster (Stanislaw Osinski).
+
 ================== Release 3.5.0 ==================

 (No Changes)
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
@ -19,6 +19,7 @@ package org.apache.solr.handler.clustering.carrot2;

 import java.io.*;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
@ -294,13 +295,17 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
  private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
    SolrParams solrParams = sreq.getParams();

-    String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
-    String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
-    if (StringUtils.isBlank(snippetField)) {
+    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
+    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
+    if (StringUtils.isBlank(snippetFieldSpec)) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
              + " must not be blank.");
    }
-    return Sets.newHashSet(titleField, snippetField);
+    
+    final Set<String> fields = Sets.newHashSet();
+    fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
+    fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
+    return fields;
  }

  /**
@ -313,8 +318,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    SolrCore core = sreq.getCore();

    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
-    String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
-    String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
+    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
+    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
    
    // Get the documents
    boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
@ -325,7 +330,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      highlighter = HighlightComponent.getHighlighter(core);
      if (highlighter != null){
        Map<String, Object> args = Maps.newHashMap();
-        snippetFieldAry = new String[]{snippetField};
+        snippetFieldAry = snippetFieldSpec.split("[, ]");
        args.put(HighlightParams.FIELDS, snippetFieldAry);
        args.put(HighlightParams.HIGHLIGHT, "true");
        args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
@ -353,7 +358,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {

    while (docsIter.hasNext()) {
      SolrDocument sdoc = docsIter.next();
-      String snippet = getValue(sdoc, snippetField);
+      String snippet = null;
+      
      // TODO: docIds will be null when running distributed search.
      // See comment in ClusteringComponent#finishStage().
      if (produceSummary && docIds != null) {
@ -361,24 +367,32 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
        DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
        if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
-          //should only be one document with one field
+          //should only be one document
          @SuppressWarnings("unchecked")
          NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
-          String [] highlt = tmp.get(snippetField);
          
-          // Join fragments with a period, so that Carrot2 does not create
-          // cross-fragment phrases, such phrases rarely make sense.
-          if (highlt != null && highlt.length > 0) {
-            final StringBuilder sb = new StringBuilder(highlt[0]);
-            for (int i = 1; i < highlt.length; i++) {
-              sb.append(" . ");
-              sb.append(highlt[i]);
+          final StringBuilder sb = new StringBuilder();
+          for (int j = 0; j < snippetFieldAry.length; j++) {
+            // Join fragments with a period, so that Carrot2 does not create
+            // cross-fragment phrases, such phrases rarely make sense.
+            String [] highlt = tmp.get(snippetFieldAry[j]);
+            if (highlt != null && highlt.length > 0) {
+              for (int i = 0; i < highlt.length; i++) {
+                sb.append(highlt[i]);
+                sb.append(" . ");
+              }
            }
-            snippet = sb.toString();
          }
+          snippet = sb.toString();
        }
      }
-      Document carrotDocument = new Document(getValue(sdoc, titleField),
+      
+      // If summaries not enabled or summary generation failed, use full content.
+      if (snippet == null) {
+        snippet = getConcatenated(sdoc, snippetFieldSpec);
+      }
+      
+      Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
              snippet, (String)sdoc.getFieldValue(urlField));
      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
      result.add(carrotDocument);
@ -387,16 +401,18 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    return result;
  }

-  protected String getValue(SolrDocument sdoc, String field) {
+  private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
    StringBuilder result = new StringBuilder();
-    Collection<Object> vals = sdoc.getFieldValues(field);
-    if(vals == null) return "";
-    Iterator<Object> ite = vals.iterator();
-    while(ite.hasNext()){
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append((String)ite.next()).append(" . ");
+    for (String field : fieldsSpec.split("[, ]")) {
+      Collection<Object> vals = sdoc.getFieldValues(field);
+      if (vals == null) continue;
+      Iterator<Object> ite = vals.iterator();
+      while(ite.hasNext()){
+        // Join multiple values with a period so that Carrot2 does not pick up
+        // phrases that cross field value boundaries (in most cases it would
+        // create useless phrases).
+        result.append((String)ite.next()).append(" . ");
+      }
    }
    return result.toString().trim();
  }
--- a/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml
+++ b/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml
@ -282,6 +282,7 @@
   <field name="url" type="string" indexed="true" stored="true" required="true" />

   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
+   <field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
   <!-- catchall field, containing all other searchable text fields (implemented
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
@ -46,6 +46,16 @@ public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
    multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
    assertNull(h.validateUpdate(adoc(multiValuedSnippet)));

+    // Add a document with multi-field title and snippet
+    final SolrInputDocument multiFieldDoc = new SolrInputDocument();
+    multiFieldDoc.addField("id", numberOfDocs++);
+    multiFieldDoc.addField("title", "Title field");
+    multiFieldDoc.addField("heading", "Heading field");
+    multiFieldDoc.addField("url", "URL");
+    multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
+    multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
+    assertNull(h.validateUpdate(adoc(multiFieldDoc)));
+    
    assertNull(h.validateUpdate(commit()));
  }

--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
@ -88,8 +88,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {

  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize) throws IOException {
-    // Two documents don't have mining in the snippet
-    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 2);
+    // Some documents don't have mining in the snippet
+    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 4);
  }

  private List<NamedList<Object>> clusterWithHighlighting(
@ -253,6 +253,47 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
    assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
  }
+  
+  @Test
+  public void concatenatingMultipleFields() throws Exception {
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
+    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
+
+    final List<String> labels = getLabels(checkEngine(
+        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
+            "snippet")), params).get(0));
+    assertTrue("Snippet contains third value", labels.get(0).contains("Title field"));
+    assertTrue("Snippet contains third value", labels.get(0).contains("Heading field"));
+    assertTrue("Snippet contains third value", labels.get(1).contains("Snippet field"));
+    assertTrue("Snippet contains third value", labels.get(1).contains("Body field"));
+  }
+
+  @Test
+  public void highlightingMultipleFields() throws Exception {
+    final TermQuery query = new TermQuery(new Term("snippet", "content"));
+
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
+    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
+    params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));
+    
+    final String snippetWithoutSummary = getLabels(checkEngine(
+        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
+    assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
+    assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));
+
+    params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
+    params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
+    params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
+    final String snippetWithSummary = getLabels(checkEngine(
+        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);    
+    assertTrue("Snippet with summary shorter than full snippet",
+        snippetWithoutSummary.length() > snippetWithSummary.length());
+    assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
+    assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
+
+  }

  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()