SOLR-788: Distributed search support for MLT.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1421326 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2012-12-13 15:03:19 +00:00
parent 6c5f475b4d
commit acf4f12fee
6 changed files with 489 additions and 57 deletions

View File

@ -146,6 +146,9 @@ New Features
* SOLR-4140: Allow access to the collections API through CloudSolrServer
without referencing an existing collection. (Per Steffensen via Mark Miller)
* SOLR-788: Distributed search support for MLT.
(Matthew Woytowitz, Mike Anderson, Jamie Johnson, Mark Miller)
Optimizations
----------------------

View File

@ -401,6 +401,33 @@ public class MoreLikeThisHandler extends RequestHandlerBase
return mlt;
}
public NamedList<BooleanQuery> getMoreLikeTheseQuery(DocList docs)
throws IOException {
IndexSchema schema = searcher.getSchema();
NamedList<BooleanQuery> result = new NamedList<BooleanQuery>();
DocIterator iterator = docs.iterator();
while (iterator.hasNext()) {
int id = iterator.nextDoc();
BooleanQuery mltquery = (BooleanQuery) mlt.like(id);
if (mltquery.clauses().size() == 0) {
return result;
}
mltquery = (BooleanQuery) getBoostedQuery(mltquery);
// exclude current document from results
BooleanQuery mltQuery = new BooleanQuery();
mltQuery.add(mltquery, BooleanClause.Occur.MUST);
String name = schema.printableUniqueKey(reader.document(id));
// Added in-case uniqueKey is uri.
mltQuery.add(
new TermQuery(new Term(uniqueKeyField.getName(), name.replace(":",
"\\:"))), BooleanClause.Occur.MUST_NOT);
result.add(name, mltQuery);
}
return result;
}
private void fillInterestingTermsFromMLTQuery( Query query, List<InterestingTerm> terms )
{
List clauses = ((BooleanQuery)query).clauses();

View File

@ -19,8 +19,24 @@ package org.apache.solr.handler.component;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.lucene.search.BooleanQuery;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.MoreLikeThisParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
@ -30,6 +46,8 @@ import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* TODO!
@ -37,79 +55,335 @@ import org.apache.solr.search.SolrIndexSearcher;
*
* @since solr 1.3
*/
public class MoreLikeThisComponent extends SearchComponent
{
public class MoreLikeThisComponent extends SearchComponent {
public static final String COMPONENT_NAME = "mlt";
public static final String DIST_DOC_ID = "mlt.dist.id";
static final Logger log = LoggerFactory
.getLogger(MoreLikeThisComponent.class);
@Override
public void prepare(ResponseBuilder rb) throws IOException
{
public void prepare(ResponseBuilder rb) throws IOException {
}
@Override
public void process(ResponseBuilder rb) throws IOException
{
SolrParams p = rb.req.getParams();
if( p.getBool( MoreLikeThisParams.MLT, false ) ) {
public void process(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (params.getBool(MoreLikeThisParams.MLT, false)) {
log.debug("Starting MoreLikeThis.Process. isShard: "
+ params.getBool(ShardParams.IS_SHARD));
SolrIndexSearcher searcher = rb.req.getSearcher();
NamedList<DocList> sim = getMoreLikeThese( rb, searcher,
rb.getResults().docList, rb.getFieldFlags() );
int mltcount = params.getInt(MoreLikeThisParams.DOC_COUNT, 20);
if (params.getBool(ShardParams.IS_SHARD, false)) {
if (params.get(MoreLikeThisComponent.DIST_DOC_ID) == null) {
MoreLikeThisHandler.MoreLikeThisHelper mlt = new MoreLikeThisHandler.MoreLikeThisHelper(
params, searcher);
// TODO ???? add this directly to the response?
rb.rsp.add( "moreLikeThis", sim );
NamedList<BooleanQuery> bQuery = mlt.getMoreLikeTheseQuery(rb
.getResults().docList);
NamedList<String> temp = new NamedList<String>();
Iterator<Entry<String,BooleanQuery>> idToQueryIt = bQuery.iterator();
while (idToQueryIt.hasNext()) {
Entry<String,BooleanQuery> idToQuery = idToQueryIt.next();
String s = idToQuery.getValue().toString();
log.debug("MLT Query:" + s);
temp.add(idToQuery.getKey(), idToQuery.getValue().toString());
}
rb.rsp.add("moreLikeThis", temp);
} else {
NamedList<DocList> sim = getMoreLikeThese(rb, rb.req.getSearcher(),
rb.getResults().docList, mltcount);
rb.rsp.add("moreLikeThis", sim);
}
} else {
// non distrib case
NamedList<DocList> sim = getMoreLikeThese(rb, rb.req.getSearcher(), rb.getResults().docList,
mltcount);
rb.rsp.add("moreLikeThis", sim);
}
}
}
NamedList<DocList> getMoreLikeThese( ResponseBuilder rb, SolrIndexSearcher searcher,
DocList docs, int flags ) throws IOException {
@Override
public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0
&& rb.req.getParams().getBool(COMPONENT_NAME, false)) {
log.debug("ShardRequest.response.size: " + sreq.responses.size());
for (ShardResponse r : sreq.responses) {
NamedList<?> moreLikeThisReponse = (NamedList<?>) r.getSolrResponse()
.getResponse().get("moreLikeThis");
log.debug("ShardRequest.response.shard: " + r.getShard());
if (moreLikeThisReponse != null) {
for (Entry<String,?> entry : moreLikeThisReponse) {
log.debug("id: \"" + entry.getKey() + "\" Query: \""
+ entry.getValue() + "\"");
ShardRequest s = buildShardQuery(rb, (String) entry.getValue(),
entry.getKey());
rb.addRequest(this, s);
}
}
}
}
if ((sreq.purpose & ShardRequest.PURPOSE_GET_MLT_RESULTS) != 0) {
for (ShardResponse r : sreq.responses) {
log.info("MLT Query returned: "
+ r.getSolrResponse().getResponse().toString());
}
}
}
@Override
public void finishStage(ResponseBuilder rb) {
// Handling Responses in finishStage, because solrResponse will put
// moreLikeThis xml
// segment ahead of result/response.
if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS
&& rb.req.getParams().getBool(COMPONENT_NAME, false)) {
Map<Object,SolrDocumentList> tempResults = new LinkedHashMap<Object,SolrDocumentList>();
int mltcount = rb.req.getParams().getInt(MoreLikeThisParams.DOC_COUNT, 20);
String keyName = rb.req.getSchema().getUniqueKeyField().getName();
for (ShardRequest sreq : rb.finished) {
if ((sreq.purpose & ShardRequest.PURPOSE_GET_MLT_RESULTS) != 0) {
for (ShardResponse r : sreq.responses) {
log.debug("ShardRequest.response.shard: " + r.getShard());
String key = r.getShardRequest().params
.get(MoreLikeThisComponent.DIST_DOC_ID);
SolrDocumentList shardDocList = (SolrDocumentList) r.getSolrResponse().getResponse().get("response");
if (shardDocList == null) {
continue;
}
log.info("MLT: results added for key: " + key + " documents: "
+ shardDocList.toString());
// if (log.isDebugEnabled()) {
// for (SolrDocument doc : shardDocList) {
// doc.addField("shard", "=" + r.getShard());
// }
// }
SolrDocumentList mergedDocList = tempResults.get(key);
if (mergedDocList == null) {
mergedDocList = new SolrDocumentList();
mergedDocList.addAll(shardDocList);
mergedDocList.setNumFound(shardDocList.getNumFound());
mergedDocList.setStart(shardDocList.getStart());
mergedDocList.setMaxScore(shardDocList.getMaxScore());
} else {
mergedDocList = mergeSolrDocumentList(mergedDocList,
shardDocList, mltcount, keyName);
}
log.debug("Adding docs for key: " + key);
tempResults.put(key, mergedDocList);
}
}
}
NamedList<SolrDocumentList> list = buildMoreLikeThisNamed(tempResults,
rb.resultIds);
rb.rsp.add("moreLikeThis", list);
}
super.finishStage(rb);
}
/**
* Returns NamedList based on the order of
* resultIds.shardDoc.positionInResponse
*/
NamedList<SolrDocumentList> buildMoreLikeThisNamed(
Map<Object,SolrDocumentList> allMlt, Map<Object,ShardDoc> resultIds) {
NamedList<SolrDocumentList> result = new NamedList<SolrDocumentList>();
TreeMap<Integer,Object> sortingMap = new TreeMap<Integer,Object>();
for (Entry<Object,ShardDoc> next : resultIds.entrySet()) {
sortingMap.put(next.getValue().positionInResponse, next.getKey());
}
for (Object key : sortingMap.values()) {
SolrDocumentList sdl = allMlt.get(key);
if (sdl == null) {
sdl = new SolrDocumentList();
sdl.setNumFound(0);
sdl.setStart(0);
}
result.add(key.toString(), sdl);
}
return result;
}
public SolrDocumentList mergeSolrDocumentList(SolrDocumentList one,
SolrDocumentList two, int maxSize, String idField) {
List<SolrDocument> l = new ArrayList<SolrDocument>();
// De-dup records sets. Shouldn't happen if indexed correctly.
Map<String,SolrDocument> map = new HashMap<String,SolrDocument>();
for (SolrDocument doc : one) {
Object id = doc.getFieldValue(idField);
assert id != null : doc.toString();
map.put(id.toString(), doc);
}
for (SolrDocument doc : two) {
map.put(doc.getFieldValue(idField).toString(), doc);
}
l = new ArrayList<SolrDocument>(map.values());
// Comparator to sort docs based on score. null scores/docs are set to 0.
// hmm...we are ordering by scores that are not really comparable...
Comparator<SolrDocument> c = new Comparator<SolrDocument>() {
public int compare(SolrDocument o1, SolrDocument o2) {
Float f1 = getFloat(o1);
Float f2 = getFloat(o2);
return f2.compareTo(f1);
}
private Float getFloat(SolrDocument doc) {
Float f = 0f;
if (doc != null) {
Object o = doc.getFieldValue("score");
if (o != null && o instanceof Float) {
f = (Float) o;
}
}
return f;
}
};
Collections.sort(l, c);
// Truncate list to maxSize
if (l.size() > maxSize) {
l = l.subList(0, maxSize);
}
// Create SolrDocumentList Attributes from originals
SolrDocumentList result = new SolrDocumentList();
result.addAll(l);
result.setMaxScore(Math.max(one.getMaxScore(), two.getMaxScore()));
result.setNumFound(one.getNumFound() + two.getNumFound());
result.setStart(Math.min(one.getStart(), two.getStart()));
return result;
}
ShardRequest buildShardQuery(ResponseBuilder rb, String q, String key) {
ShardRequest s = new ShardRequest();
s.params = new ModifiableSolrParams(rb.req.getParams());
s.purpose |= ShardRequest.PURPOSE_GET_MLT_RESULTS;
// Maybe unnecessary, but safe.
s.purpose |= ShardRequest.PURPOSE_PRIVATE;
s.params.remove(ShardParams.SHARDS);
// s.params.remove(MoreLikeThisComponent.COMPONENT_NAME);
// needed to correlate results
s.params.set(MoreLikeThisComponent.DIST_DOC_ID, key);
s.params.set(CommonParams.START, 0);
int mltcount = s.params.getInt(MoreLikeThisParams.DOC_COUNT, 20);
s.params.set(CommonParams.ROWS, mltcount);
// adding score to rank moreLikeThis
s.params.remove(CommonParams.FL);
// Should probably add something like this:
// String fl = s.params.get(MoreLikeThisParams.RETURN_FL, "*");
// if(fl != null){
// s.params.set(CommonParams.FL, fl + ",score");
// }
String id = rb.req.getSchema().getUniqueKeyField()
.getName();
s.params.set(CommonParams.FL, "score," + id);
s.params.set("sort", "score desc");
// MLT Query is submitted as normal query to shards.
s.params.set(CommonParams.Q, q);
s.shards = ShardRequest.ALL_SHARDS;
return s;
}
ShardRequest buildMLTQuery(ResponseBuilder rb, String q) {
ShardRequest s = new ShardRequest();
s.params = new ModifiableSolrParams();
s.params.set(CommonParams.START, 0);
String id = rb.req.getSchema().getUniqueKeyField() .getName();
s.params.set(CommonParams.FL, "score," + id);
// MLT Query is submitted as normal query to shards.
s.params.set(CommonParams.Q, q);
s.shards = ShardRequest.ALL_SHARDS;
return s;
}
NamedList<DocList> getMoreLikeThese(ResponseBuilder rb,
SolrIndexSearcher searcher, DocList docs, int flags) throws IOException {
SolrParams p = rb.req.getParams();
IndexSchema schema = searcher.getSchema();
MoreLikeThisHandler.MoreLikeThisHelper mltHelper
= new MoreLikeThisHandler.MoreLikeThisHelper( p, searcher );
MoreLikeThisHandler.MoreLikeThisHelper mltHelper = new MoreLikeThisHandler.MoreLikeThisHelper(
p, searcher);
NamedList<DocList> mlt = new SimpleOrderedMap<DocList>();
DocIterator iterator = docs.iterator();
SimpleOrderedMap<Object> dbg = null;
if( rb.isDebug() ){
if (rb.isDebug()) {
dbg = new SimpleOrderedMap<Object>();
}
while( iterator.hasNext() ) {
while (iterator.hasNext()) {
int id = iterator.nextDoc();
int rows = p.getInt( MoreLikeThisParams.DOC_COUNT, 5 );
DocListAndSet sim = mltHelper.getMoreLikeThis( id, 0, rows, null, null, flags );
String name = schema.printableUniqueKey( searcher.doc( id ) );
int rows = p.getInt(MoreLikeThisParams.DOC_COUNT, 20);
DocListAndSet sim = mltHelper.getMoreLikeThis(id, 0, rows, null, null,
flags);
String name = schema.printableUniqueKey(searcher.doc(id));
mlt.add(name, sim.docList);
if( dbg != null ){
if (dbg != null) {
SimpleOrderedMap<Object> docDbg = new SimpleOrderedMap<Object>();
docDbg.add( "rawMLTQuery", mltHelper.getRawMLTQuery().toString() );
docDbg.add( "boostedMLTQuery", mltHelper.getBoostedMLTQuery().toString() );
docDbg.add( "realMLTQuery", mltHelper.getRealMLTQuery().toString() );
docDbg.add("rawMLTQuery", mltHelper.getRawMLTQuery().toString());
docDbg
.add("boostedMLTQuery", mltHelper.getBoostedMLTQuery().toString());
docDbg.add("realMLTQuery", mltHelper.getRealMLTQuery().toString());
SimpleOrderedMap<Object> explains = new SimpleOrderedMap<Object>();
DocIterator mltIte = sim.docList.iterator();
while( mltIte.hasNext() ){
while (mltIte.hasNext()) {
int mltid = mltIte.nextDoc();
String key = schema.printableUniqueKey( searcher.doc( mltid ) );
explains.add( key, searcher.explain( mltHelper.getRealMLTQuery(), mltid ) );
String key = schema.printableUniqueKey(searcher.doc(mltid));
explains.add(key,
searcher.explain(mltHelper.getRealMLTQuery(), mltid));
}
docDbg.add( "explain", explains );
dbg.add( name, docDbg );
docDbg.add("explain", explains);
dbg.add(name, docDbg);
}
}
// add debug information
if( dbg != null ){
rb.addDebugInfo( "moreLikeThis", dbg );
if (dbg != null) {
rb.addDebugInfo("moreLikeThis", dbg);
}
return mlt;
}
/////////////////////////////////////////////
/// SolrInfoMBean
////////////////////////////////////////////
// ///////////////////////////////////////////
// / SolrInfoMBean
// //////////////////////////////////////////
@Override
public String getDescription() {

View File

@ -38,6 +38,7 @@ public class ShardRequest {
public final static int PURPOSE_GET_STATS =0x200;
public final static int PURPOSE_GET_TERMS =0x400;
public final static int PURPOSE_GET_TOP_GROUPS =0x800;
public final static int PURPOSE_GET_MLT_RESULTS =0x1000;
public int purpose; // the purpose of this request

View File

@ -400,6 +400,10 @@
<arr name="last-components">
<str>spellcheck</str>
</arr>
</requestHandler>
<requestHandler name="mltrh" class="org.apache.solr.handler.component.SearchHandler">
</requestHandler>
<searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>

View File

@ -0,0 +1,123 @@
package org.apache.solr.handler.component;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.junit.BeforeClass;
/**
* Test for distributed MoreLikeThisComponent's
*
* @since solr 4.1
*
* @see org.apache.solr.handler.component.MoreLikeThisComponent
*/
@Slow
public class DistributedMLTComponentTest extends BaseDistributedSearchTestCase {
private String requestHandlerName;
public DistributedMLTComponentTest()
{
fixShardCount=true;
shardCount=2;
stress=0;
}
@BeforeClass
public static void beforeClass() throws Exception {
}
@Override
public void setUp() throws Exception {
requestHandlerName = "mltrh";
super.setUp();
}
@Override
public void tearDown() throws Exception {
super.tearDown();
}
@Override
public void doTest() throws Exception {
del("*:*");
index(id, "1", "lowerfilt", "toyota");
index(id, "2", "lowerfilt", "chevrolet");
index(id, "3", "lowerfilt", "suzuki");
index(id, "4", "lowerfilt", "ford");
index(id, "5", "lowerfilt", "ferrari");
index(id, "6", "lowerfilt", "jaguar");
index(id, "7", "lowerfilt", "mclaren moon or the moon and moon");
index(id, "8", "lowerfilt", "sonata");
index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy big and large brown dogs.");
index(id, "10", "lowerfilt", "blue");
index(id, "12", "lowerfilt", "glue");
index(id, "13", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "14", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "15", "lowerfilt", "The fat red fox jumped over the lazy brown dogs.");
index(id, "16", "lowerfilt", "The slim red fox jumped over the lazy brown dogs.");
index(id, "17", "lowerfilt", "The quote red fox jumped moon over the lazy brown dogs moon. Of course moon");
index(id, "18", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "19", "lowerfilt", "The hose red fox jumped over the lazy brown dogs.");
index(id, "20", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "21", "lowerfilt", "The court red fox jumped over the lazy brown dogs.");
index(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "24", "lowerfilt", "The file red fox jumped over the lazy brown dogs.");
index(id, "25", "lowerfilt", "rod fix");
commit();
handle.clear();
handle.put("QTime", SKIPVAL);
handle.put("timestamp", SKIPVAL);
handle.put("maxScore", SKIPVAL);
// we care only about the mlt results
handle.put("response", SKIP);
// currently distrib mlt is sorting by score (even though it's not really comparable across shards)
// so it may not match the sort of single shard mlt
handle.put("17", UNORDERED);
query("q", "match_none", "mlt", "true", "mlt.fl", "lowerfilt", "qt", requestHandlerName, "shards.qt", requestHandlerName);
query("q", "lowerfilt:sonata", "mlt", "true", "mlt.fl", "lowerfilt", "qt", requestHandlerName, "shards.qt", requestHandlerName);
query("q", "lowerfilt:moon", "fl", id, "sort", "id desc", "mlt", "true", "mlt.fl", "lowerfilt", "qt", requestHandlerName, "shards.qt", requestHandlerName);
handle.put("24", UNORDERED);
handle.put("23", UNORDERED);
handle.put("22", UNORDERED);
handle.put("21", UNORDERED);
handle.put("20", UNORDERED);
handle.put("19", UNORDERED);
handle.put("18", UNORDERED);
handle.put("17", UNORDERED);
handle.put("16", UNORDERED);
handle.put("15", UNORDERED);
handle.put("14", UNORDERED);
handle.put("13", UNORDERED);
query("q", "lowerfilt:fox", "fl", id, "sort", "id desc", "mlt", "true", "mlt.fl", "lowerfilt", "qt", requestHandlerName, "shards.qt", requestHandlerName);
//query("q", "*:*", "mlt", "true", "mlt.fl", "lowerfilt", "qt", requestHandlerName, "shards.qt", requestHandlerName);
}
}