SOLR-2938: Clustering on multiple fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1212489 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stanisław Osiński 2011-12-09 15:58:42 +00:00
parent 8e52925f74
commit 4004e2abee
5 changed files with 102 additions and 30 deletions

View File

@ -18,6 +18,10 @@ $Id$
by the clustering plugin, can be overridden by carrot.summarySnippets
if needed (Stanislaw Osinski).
* SOLR-2938: Clustering on multiple fields. The carrot.title and
carrot.snippet can now take comma- or space-separated lists of
field names to cluster (Stanislaw Osinski).
================== Release 3.5.0 ==================
(No Changes)

View File

@ -19,6 +19,7 @@ package org.apache.solr.handler.clustering.carrot2;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@ -294,13 +295,17 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
SolrParams solrParams = sreq.getParams();
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
if (StringUtils.isBlank(snippetFieldSpec)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
return Sets.newHashSet(titleField, snippetField);
final Set<String> fields = Sets.newHashSet();
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
return fields;
}
/**
@ -313,8 +318,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
SolrCore core = sreq.getCore();
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
// Get the documents
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
@ -325,7 +330,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
snippetFieldAry = snippetFieldSpec.split("[, ]");
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
@ -353,7 +358,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
while (docsIter.hasNext()) {
SolrDocument sdoc = docsIter.next();
String snippet = getValue(sdoc, snippetField);
String snippet = null;
// TODO: docIds will be null when running distributed search.
// See comment in ClusteringComponent#finishStage().
if (produceSummary && docIds != null) {
@ -361,24 +367,32 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
//should only be one document
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
String [] highlt = tmp.get(snippetField);
// Join fragments with a period, so that Carrot2 does not create
// cross-fragment phrases, such phrases rarely make sense.
if (highlt != null && highlt.length > 0) {
final StringBuilder sb = new StringBuilder(highlt[0]);
for (int i = 1; i < highlt.length; i++) {
sb.append(" . ");
sb.append(highlt[i]);
final StringBuilder sb = new StringBuilder();
for (int j = 0; j < snippetFieldAry.length; j++) {
// Join fragments with a period, so that Carrot2 does not create
// cross-fragment phrases, such phrases rarely make sense.
String [] highlt = tmp.get(snippetFieldAry[j]);
if (highlt != null && highlt.length > 0) {
for (int i = 0; i < highlt.length; i++) {
sb.append(highlt[i]);
sb.append(" . ");
}
}
snippet = sb.toString();
}
snippet = sb.toString();
}
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
// If summaries not enabled or summary generation failed, use full content.
if (snippet == null) {
snippet = getConcatenated(sdoc, snippetFieldSpec);
}
Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
snippet, (String)sdoc.getFieldValue(urlField));
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
result.add(carrotDocument);
@ -387,16 +401,18 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
return result;
}
protected String getValue(SolrDocument sdoc, String field) {
private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
if(vals == null) return "";
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append((String)ite.next()).append(" . ");
for (String field : fieldsSpec.split("[, ]")) {
Collection<Object> vals = sdoc.getFieldValues(field);
if (vals == null) continue;
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append((String)ite.next()).append(" . ");
}
}
return result.toString().trim();
}

View File

@ -282,6 +282,7 @@
<field name="url" type="string" indexed="true" stored="true" required="true" />
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented

View File

@ -46,6 +46,16 @@ public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
// Add a document with multi-field title and snippet
final SolrInputDocument multiFieldDoc = new SolrInputDocument();
multiFieldDoc.addField("id", numberOfDocs++);
multiFieldDoc.addField("title", "Title field");
multiFieldDoc.addField("heading", "Heading field");
multiFieldDoc.addField("url", "URL");
multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
assertNull(h.validateUpdate(commit()));
}

View File

@ -88,8 +88,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize) throws IOException {
// Two documents don't have mining in the snippet
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 2);
// Some documents don't have mining in the snippet
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 4);
}
private List<NamedList<Object>> clusterWithHighlighting(
@ -253,6 +253,47 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
}
@Test
public void concatenatingMultipleFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
"snippet")), params).get(0));
assertTrue("Snippet contains third value", labels.get(0).contains("Title field"));
assertTrue("Snippet contains third value", labels.get(0).contains("Heading field"));
assertTrue("Snippet contains third value", labels.get(1).contains("Snippet field"));
assertTrue("Snippet contains third value", labels.get(1).contains("Body field"));
}
@Test
public void highlightingMultipleFields() throws Exception {
final TermQuery query = new TermQuery(new Term("snippet", "content"));
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));
final String snippetWithoutSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));
params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
final String snippetWithSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
}
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()