mirror of
https://github.com/apache/lucene.git
synced 2025-02-22 18:27:21 +00:00
SOLR-2938: Clustering on multiple fields
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1212489 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8e52925f74
commit
4004e2abee
@ -18,6 +18,10 @@ $Id$
|
||||
by the clustering plugin, can be overridden by carrot.summarySnippets
|
||||
if needed (Stanislaw Osinski).
|
||||
|
||||
* SOLR-2938: Clustering on multiple fields. The carrot.title and
|
||||
carrot.snippet can now take comma- or space-separated lists of
|
||||
field names to cluster (Stanislaw Osinski).
|
||||
|
||||
================== Release 3.5.0 ==================
|
||||
|
||||
(No Changes)
|
||||
|
@ -19,6 +19,7 @@ package org.apache.solr.handler.clustering.carrot2;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@ -294,13 +295,17 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||
private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
|
||||
SolrParams solrParams = sreq.getParams();
|
||||
|
||||
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
|
||||
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
|
||||
if (StringUtils.isBlank(snippetField)) {
|
||||
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
|
||||
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
|
||||
if (StringUtils.isBlank(snippetFieldSpec)) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
|
||||
+ " must not be blank.");
|
||||
}
|
||||
return Sets.newHashSet(titleField, snippetField);
|
||||
|
||||
final Set<String> fields = Sets.newHashSet();
|
||||
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
|
||||
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -313,8 +318,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||
SolrCore core = sreq.getCore();
|
||||
|
||||
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
|
||||
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
|
||||
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
|
||||
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
|
||||
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
|
||||
|
||||
// Get the documents
|
||||
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
|
||||
@ -325,7 +330,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||
highlighter = HighlightComponent.getHighlighter(core);
|
||||
if (highlighter != null){
|
||||
Map<String, Object> args = Maps.newHashMap();
|
||||
snippetFieldAry = new String[]{snippetField};
|
||||
snippetFieldAry = snippetFieldSpec.split("[, ]");
|
||||
args.put(HighlightParams.FIELDS, snippetFieldAry);
|
||||
args.put(HighlightParams.HIGHLIGHT, "true");
|
||||
args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
|
||||
@ -353,7 +358,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||
|
||||
while (docsIter.hasNext()) {
|
||||
SolrDocument sdoc = docsIter.next();
|
||||
String snippet = getValue(sdoc, snippetField);
|
||||
String snippet = null;
|
||||
|
||||
// TODO: docIds will be null when running distributed search.
|
||||
// See comment in ClusteringComponent#finishStage().
|
||||
if (produceSummary && docIds != null) {
|
||||
@ -361,24 +367,32 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
|
||||
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
|
||||
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
|
||||
//should only be one document with one field
|
||||
//should only be one document
|
||||
@SuppressWarnings("unchecked")
|
||||
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
|
||||
String [] highlt = tmp.get(snippetField);
|
||||
|
||||
// Join fragments with a period, so that Carrot2 does not create
|
||||
// cross-fragment phrases, such phrases rarely make sense.
|
||||
if (highlt != null && highlt.length > 0) {
|
||||
final StringBuilder sb = new StringBuilder(highlt[0]);
|
||||
for (int i = 1; i < highlt.length; i++) {
|
||||
sb.append(" . ");
|
||||
sb.append(highlt[i]);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (int j = 0; j < snippetFieldAry.length; j++) {
|
||||
// Join fragments with a period, so that Carrot2 does not create
|
||||
// cross-fragment phrases, such phrases rarely make sense.
|
||||
String [] highlt = tmp.get(snippetFieldAry[j]);
|
||||
if (highlt != null && highlt.length > 0) {
|
||||
for (int i = 0; i < highlt.length; i++) {
|
||||
sb.append(highlt[i]);
|
||||
sb.append(" . ");
|
||||
}
|
||||
}
|
||||
snippet = sb.toString();
|
||||
}
|
||||
snippet = sb.toString();
|
||||
}
|
||||
}
|
||||
Document carrotDocument = new Document(getValue(sdoc, titleField),
|
||||
|
||||
// If summaries not enabled or summary generation failed, use full content.
|
||||
if (snippet == null) {
|
||||
snippet = getConcatenated(sdoc, snippetFieldSpec);
|
||||
}
|
||||
|
||||
Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
|
||||
snippet, (String)sdoc.getFieldValue(urlField));
|
||||
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
|
||||
result.add(carrotDocument);
|
||||
@ -387,16 +401,18 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||
return result;
|
||||
}
|
||||
|
||||
protected String getValue(SolrDocument sdoc, String field) {
|
||||
private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
Collection<Object> vals = sdoc.getFieldValues(field);
|
||||
if(vals == null) return "";
|
||||
Iterator<Object> ite = vals.iterator();
|
||||
while(ite.hasNext()){
|
||||
// Join multiple values with a period so that Carrot2 does not pick up
|
||||
// phrases that cross field value boundaries (in most cases it would
|
||||
// create useless phrases).
|
||||
result.append((String)ite.next()).append(" . ");
|
||||
for (String field : fieldsSpec.split("[, ]")) {
|
||||
Collection<Object> vals = sdoc.getFieldValues(field);
|
||||
if (vals == null) continue;
|
||||
Iterator<Object> ite = vals.iterator();
|
||||
while(ite.hasNext()){
|
||||
// Join multiple values with a period so that Carrot2 does not pick up
|
||||
// phrases that cross field value boundaries (in most cases it would
|
||||
// create useless phrases).
|
||||
result.append((String)ite.next()).append(" . ");
|
||||
}
|
||||
}
|
||||
return result.toString().trim();
|
||||
}
|
||||
|
@ -282,6 +282,7 @@
|
||||
<field name="url" type="string" indexed="true" stored="true" required="true" />
|
||||
|
||||
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
<field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
<field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
<!-- catchall field, containing all other searchable text fields (implemented
|
||||
|
@ -46,6 +46,16 @@ public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
|
||||
multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
|
||||
assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
|
||||
|
||||
// Add a document with multi-field title and snippet
|
||||
final SolrInputDocument multiFieldDoc = new SolrInputDocument();
|
||||
multiFieldDoc.addField("id", numberOfDocs++);
|
||||
multiFieldDoc.addField("title", "Title field");
|
||||
multiFieldDoc.addField("heading", "Heading field");
|
||||
multiFieldDoc.addField("url", "URL");
|
||||
multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
|
||||
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
|
||||
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
|
||||
|
||||
assertNull(h.validateUpdate(commit()));
|
||||
}
|
||||
|
||||
|
@ -88,8 +88,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||
|
||||
private List<NamedList<Object>> clusterWithHighlighting(
|
||||
boolean enableHighlighting, int fragSize) throws IOException {
|
||||
// Two documents don't have mining in the snippet
|
||||
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 2);
|
||||
// Some documents don't have mining in the snippet
|
||||
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 4);
|
||||
}
|
||||
|
||||
private List<NamedList<Object>> clusterWithHighlighting(
|
||||
@ -253,6 +253,47 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||
assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
|
||||
assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void concatenatingMultipleFields() throws Exception {
|
||||
final ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
|
||||
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
|
||||
|
||||
final List<String> labels = getLabels(checkEngine(
|
||||
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
|
||||
"snippet")), params).get(0));
|
||||
assertTrue("Snippet contains third value", labels.get(0).contains("Title field"));
|
||||
assertTrue("Snippet contains third value", labels.get(0).contains("Heading field"));
|
||||
assertTrue("Snippet contains third value", labels.get(1).contains("Snippet field"));
|
||||
assertTrue("Snippet contains third value", labels.get(1).contains("Body field"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void highlightingMultipleFields() throws Exception {
|
||||
final TermQuery query = new TermQuery(new Term("snippet", "content"));
|
||||
|
||||
final ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
|
||||
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
|
||||
params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));
|
||||
|
||||
final String snippetWithoutSummary = getLabels(checkEngine(
|
||||
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
|
||||
assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
|
||||
assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));
|
||||
|
||||
params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
|
||||
params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
|
||||
params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
|
||||
final String snippetWithSummary = getLabels(checkEngine(
|
||||
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
|
||||
assertTrue("Snippet with summary shorter than full snippet",
|
||||
snippetWithoutSummary.length() > snippetWithSummary.length());
|
||||
assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
|
||||
assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
|
||||
|
||||
}
|
||||
|
||||
private CarrotClusteringEngine getClusteringEngine(String engineName) {
|
||||
ClusteringComponent comp = (ClusteringComponent) h.getCore()
|
||||
|
Loading…
x
Reference in New Issue
Block a user