SOLR-2282: add distributed support to search results clustering

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1051715 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2010-12-22 00:46:47 +00:00
parent 1d9135f0e4
commit 2c859ba49b
7 changed files with 270 additions and 60 deletions

View File

@ -6,7 +6,7 @@ See http://wiki.apache.org/solr/ClusteringComponent
CHANGES
$Id:$
$Id$
================== Release XXXX ==================
@ -17,6 +17,9 @@ $Id:$
* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2. No more LGPL run-time dependencies.
This release of C2 also does not have a specific Lucene dependency. (Stanislaw Osinski, gsingers)
* SOLR-2282: Add distributed search support for search result clustering.
(Brad Giaccio, koji)
================== Release 1.4.0 ==================
Solr Clustering will be released for the first time in Solr 1.4. See http://wiki.apache.org/solr/ClusteringComponent

View File

@ -44,6 +44,10 @@
<pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
<pathelement location="${common-solr.dir}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
<path refid="common.classpath"/>
<!-- DistributedClusteringComponentTest uses Jetty -->
<fileset dir="${solr-path}/example/lib">
<include name="**/*.jar" />
</fileset>
</path>
<target name="clean">

View File

@ -16,14 +16,22 @@ package org.apache.solr.handler.clustering;
* limitations under the License.
*/
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.carrot2.CarrotClusteringEngine;
import org.apache.solr.handler.clustering.carrot2.CarrotParams;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -31,7 +39,9 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
@ -53,7 +63,7 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
public static final String COMPONENT_NAME = "clustering";
private NamedList initParams;
@Override
public void prepare(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
@ -61,18 +71,21 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
}
}
@Override
public void process(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
return;
}
String name = params.get(ClusteringParams.ENGINE_NAME, ClusteringEngine.DEFAULT_ENGINE_NAME);
String name = getClusteringEngineName(rb);
boolean useResults = params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false);
if (useResults == true) {
SearchClusteringEngine engine = searchClusteringEngines.get(name);
SearchClusteringEngine engine = getSearchClusteringEngine(rb);
if (engine != null) {
DocListAndSet results = rb.getResults();
Object clusters = engine.cluster(rb.getQuery(), results.docList, rb.req);
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(results.docList.size());
SolrDocumentList solrDocList = engine.getSolrDocumentList(results.docList, rb.req, docIds);
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
rb.rsp.add("clusters", clusters);
} else {
log.warn("No engine for: " + name);
@ -97,6 +110,72 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
}
}
}
private SearchClusteringEngine getSearchClusteringEngine(ResponseBuilder rb){
return searchClusteringEngines.get(getClusteringEngineName(rb));
}
private String getClusteringEngineName(ResponseBuilder rb){
return rb.req.getParams().get(ClusteringParams.ENGINE_NAME, ClusteringEngine.DEFAULT_ENGINE_NAME);
}
@Override
public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
return;
}
sreq.params.remove(COMPONENT_NAME);
if( ( sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS ) != 0 ){
String fl = sreq.params.get(CommonParams.FL,"*");
// if fl=* then we don't need check
if( fl.indexOf( '*' ) >= 0 ) return;
Set<String> fields = getSearchClusteringEngine(rb).getFieldsToLoad(rb.req);
if( fields == null || fields.size() == 0 ) return;
StringBuilder sb = new StringBuilder();
String[] flparams = fl.split( "[,\\s]+" );
Set<String> flParamSet = new HashSet<String>(flparams.length);
for( String flparam : flparams ){
// no need trim() because of split() by \s+
flParamSet.add(flparam);
}
for( String aFieldToLoad : fields ){
if( !flParamSet.contains( aFieldToLoad ) ){
sb.append( ',' ).append( aFieldToLoad );
}
}
if( sb.length() > 0 ){
sreq.params.set( CommonParams.FL, fl + sb.toString() );
}
}
}
@Override
public void finishStage(ResponseBuilder rb) {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
return;
}
if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
SearchClusteringEngine engine = getSearchClusteringEngine(rb);
if (engine != null) {
SolrDocumentList solrDocList = (SolrDocumentList)rb.rsp.getValues().get("response");
// TODO: Currently, docIds is set to null in distributed environment.
// This causes CarrotParams.PRODUCE_SUMMARY doesn't work.
// To work CarrotParams.PRODUCE_SUMMARY under distributed mode, we can choose either one of:
// (a) In each shard, ClusteringComponent produces summary and finishStage()
// merges these summaries.
// (b) Adding doHighlighting(SolrDocumentList, ...) method to SolrHighlighter and
// making SolrHighlighter uses "external text" rather than stored values to produce snippets.
Map<SolrDocument,Integer> docIds = null;
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
rb.rsp.add("clusters", clusters);
} else {
String name = getClusteringEngineName(rb);
log.warn("No engine for: " + name);
}
}
}
@Override
@SuppressWarnings("unchecked")
@ -174,17 +253,17 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
@Override
public String getVersion() {
return "$Revision:$";
return "$Revision$";
}
@Override
public String getSourceId() {
return "$Id:$";
return "$Id$";
}
@Override
public String getSource() {
return "$URL:$";
return "$URL$";
}
}

View File

@ -16,12 +16,16 @@ package org.apache.solr.handler.clustering;
* limitations under the License.
*/
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.DocList;
import org.apache.solr.request.SolrQueryRequest;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.util.SolrPluginUtils;
/**
@ -30,8 +34,27 @@ import org.apache.lucene.search.Query;
**/
public abstract class SearchClusteringEngine extends ClusteringEngine {
@Deprecated
public abstract Object cluster(Query query, DocList docList, SolrQueryRequest sreq);
// TODO: need DocList, too?
public abstract Object cluster(Query query, SolrDocumentList solrDocumentList,
Map<SolrDocument,Integer> docIds, SolrQueryRequest sreq);
/**
* Returns the set of field names to load.
* Concrete classes can override this method if needed.
* Default implementation returns null, that is, all stored fields are loaded.
* @param sreq
* @return set of field names to load
*/
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
return null;
}
public SolrDocumentList getSolrDocumentList(DocList docList, SolrQueryRequest sreq,
Map<SolrDocument, Integer> docIds) throws IOException{
return SolrPluginUtils.docListToSolrDocumentList(
docList, sreq.getSearcher(), getFieldsToLoad(sreq), docIds);
}
}

View File

@ -18,25 +18,38 @@ package org.apache.solr.handler.clustering.carrot2;
*/
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.*;
import org.apache.solr.util.RefCounted;
import org.carrot2.core.*;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -63,11 +76,25 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
private String idFieldName;
@Deprecated
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
SolrIndexSearcher searcher = sreq.getSearcher();
SolrDocumentList solrDocList;
try {
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, getFieldsToLoad(sreq), docIds );
return cluster(query, solrDocList, docIds, sreq);
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
public Object cluster(Query query, SolrDocumentList solrDocList,
Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
try {
// Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<String, Object>();
List<Document> documents = getDocuments(docList, query, sreq);
List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
@ -79,7 +106,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
clusteringAlgorithmClass).getClusters(), sreq.getParams());
} catch (Exception e) {
log.error("Carrot2 clustering failed", e);
throw new RuntimeException(e);
throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
}
}
@ -114,31 +141,36 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
return result;
}
@Override
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
// Names of fields to deliver content for clustering
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
}
/**
* Prepares Carrot2 documents for clustering.
*/
private List<Document> getDocuments(DocList docList,
private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds,
Query query, final SolrQueryRequest sreq) throws IOException {
SolrHighlighter highlighter = null;
SolrParams solrParams = sreq.getParams();
SolrCore core = sreq.getCore();
// Names of fields to deliver content for clustering
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME,
titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
Set<String> fieldsToLoad = Sets.newHashSet(urlField, titleField,
snippetField, idFieldName);
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
// Get the documents
DocIterator docsIter = docList.iterator();
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY,
false);
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
SolrQueryRequest req = null;
String[] snippetFieldAry = null;
@ -164,20 +196,20 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
}
SolrIndexSearcher searcher = sreq.getSearcher();
List<Document> result = new ArrayList<Document>(docList.size());
Iterator<SolrDocument> docsIter = solrDocList.iterator();
List<Document> result = new ArrayList<Document>(solrDocList.size());
float[] scores = {1.0f};
int[] docsHolder = new int[1];
Query theQuery = query;
while (docsIter.hasNext()) {
Integer id = docsIter.next();
org.apache.lucene.document.Document doc = searcher.doc(id,
fieldsToLoad);
String snippet = getValue(doc, snippetField);
if (produceSummary == true) {
docsHolder[0] = id.intValue();
SolrDocument sdoc = docsIter.next();
String snippet = getValue(sdoc, snippetField);
// TODO: docIds will be null when running distributed search.
// See comment in ClusteringComponent#finishStage().
if (produceSummary && docIds != null) {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
@ -189,15 +221,16 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
}
}
Document carrotDocument = new Document(getValue(doc, titleField),
snippet, doc.get(urlField));
carrotDocument.setField("solrId", doc.get(idFieldName));
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
result.add(carrotDocument);
}
return result;
}
@Deprecated
protected String getValue(org.apache.lucene.document.Document doc,
String field) {
StringBuilder result = new StringBuilder();
@ -211,6 +244,20 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
return result.toString().trim();
}
protected String getValue(SolrDocument sdoc, String field) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
if(vals == null) return "";
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append((String)ite.next()).append(" . ");
}
return result.toString().trim();
}
private List clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
List result = new ArrayList();

View File

@ -0,0 +1,47 @@
package org.apache.solr.handler.clustering;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.common.params.CommonParams;
public class DistributedClusteringComponentTest extends
BaseDistributedSearchTestCase {
@Override
public void doTest() throws Exception {
del("*:*");
int numberOfDocs = 0;
for (String[] doc : AbstractClusteringTestCase.DOCUMENTS) {
index(id, Integer.toString(numberOfDocs++), "url", doc[0], "title", doc[1], "snippet", doc[2]);
}
commit();
handle.clear();
// Only really care about the clusters for this test case, so drop the header and response
handle.put("responseHeader", SKIP);
handle.put("response", SKIP);
query(
ClusteringComponent.COMPONENT_NAME, "true",
CommonParams.Q, "*:*",
CommonParams.SORT, id + " desc",
ClusteringParams.USE_SEARCH_RESULTS, "true");
// destroy is not needed because tearDown method of base class does it.
//destroyServers();
}
}

View File

@ -22,6 +22,8 @@ import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@ -31,11 +33,14 @@ import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.*;
@ -133,21 +138,23 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
docList = searcher.getDocList(query, (Query) null, new Sort(), 0,
numberOfDocs);
assertEquals("docList size", expectedNumDocs, docList.matches());
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
solrParams.add(clusteringParams);
// Perform clustering
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
List results = (List)engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
return results;
} finally {
ref.decref();
}
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
solrParams.add(clusteringParams);
// Perform clustering
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
List results = (List) engine.cluster(query, docList, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
return results;
}
private void checkClusters(List results, int expectedDocCount,