SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with content stream. The similarity is calculated between the content stream's value and all fields listed in mlt.fl.

This commit is contained in:
Dawid Weiss 2018-05-25 11:39:42 +02:00
parent 0a1de2c4a5
commit 41ecad9897
3 changed files with 141 additions and 67 deletions

View File

@ -131,6 +131,10 @@ New Features
Bug Fixes
----------------------
* SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with
content stream. The similarity is calculated between the content stream's value and all
fields listed in mlt.fl. (Dawid Weiss)
* SOLR-12103: Raise CryptoKeys.DEFAULT_KEYPAIR_LENGTH from 1024 to 2048. (Mark Miller)
* SOLR-12107: Fixed an error in [child] transformer that could occur if documentCache was not used (hossman)
@ -425,6 +429,7 @@ Upgrade Notes
New Features
----------------------
* SOLR-11285: Simulation framework for autoscaling. (ab)
* LUCENE-2899: In the Solr analysis-extras contrib, added support for the

View File

@ -21,6 +21,8 @@ import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -36,6 +38,7 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.CommonParams;
@ -80,7 +83,13 @@ public class MoreLikeThisHandler extends RequestHandlerBase
private static final Pattern splitList = Pattern.compile(",| ");
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
static final String ERR_MSG_QUERY_OR_TEXT_REQUIRED =
"MoreLikeThis requires either a query (?q=) or text to find similar documents.";
static final String ERR_MSG_SINGLE_STREAM_ONLY =
"MoreLikeThis does not support multiple ContentStreams";
@Override
public void init(NamedList args) {
super.init(args);
@ -156,7 +165,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase
}
if (iter.hasNext()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis does not support multiple ContentStreams");
ERR_MSG_SINGLE_STREAM_ONLY);
}
}
}
@ -191,7 +200,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase
}
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires either a query (?q=) or text to find similar documents.");
ERR_MSG_QUERY_OR_TEXT_REQUIRED);
}
} finally {
@ -411,10 +420,31 @@ public class MoreLikeThisHandler extends RequestHandlerBase
public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
{
// analyzing with the first field: previous (stupid) behavior
rawMLTQuery = mlt.like(mlt.getFieldNames()[0], reader);
// SOLR-5351: if we only check against a single field, use the reader directly. Otherwise we
// repeat the stream's content for multiple fields so that query terms can be pulled from any
// of those fields.
String [] fields = mlt.getFieldNames();
if (fields.length == 1) {
rawMLTQuery = mlt.like(fields[0], reader);
} else {
CharsRefBuilder buffered = new CharsRefBuilder();
char [] chunk = new char [1024];
int len;
while ((len = reader.read(chunk)) >= 0) {
buffered.append(chunk, 0, len);
}
Collection<Object> streamValue = Collections.singleton(buffered.get().toString());
Map<String, Collection<Object>> multifieldDoc = new HashMap<>(fields.length);
for (String field : fields) {
multifieldDoc.put(field, streamValue);
}
rawMLTQuery = mlt.like(multifieldDoc);
}
boostedMLTQuery = getBoostedQuery( rawMLTQuery );
if( terms != null ) {
if (terms != null) {
fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );
}
DocListAndSet results = new DocListAndSet();

View File

@ -18,6 +18,7 @@ package org.apache.solr.handler;
import java.util.ArrayList;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.*;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
@ -29,12 +30,10 @@ import org.apache.solr.response.SolrQueryResponse;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* TODO -- this needs to actually test the results/query etc
*/
public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
@BeforeClass
public static void moreLikeThisBeforeClass() throws Exception {
initCore("solrconfig.xml", "schema.xml");
@ -47,27 +46,7 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
MoreLikeThisHandler mlt = new MoreLikeThisHandler();
ModifiableSolrParams params = new ModifiableSolrParams();
SolrQueryRequestBase req = new SolrQueryRequestBase( core, params) {};
// requires 'q' or single content stream
try {
mlt.handleRequestBody( req, new SolrQueryResponse() );
}
catch( Exception ex ) {} // expected
// requires 'q' or single content stream
try {
ArrayList<ContentStream> streams = new ArrayList<>( 2 );
streams.add( new ContentStreamBase.StringStream( "hello" ) );
streams.add( new ContentStreamBase.StringStream( "there" ) );
req.setContentStreams( streams );
mlt.handleRequestBody( req, new SolrQueryResponse() );
}
catch( Exception ex ) {} // expected
finally {
req.close();
}
assertU(adoc("id","42","name","Tom Cruise","subword","Top Gun","subword","Risky Business","subword","The Color of Money","subword","Minority Report","subword", "Days of Thunder","subword", "Eyes Wide Shut","subword", "Far and Away", "foo_ti","10"));
assertU(adoc("id","43","name","Tom Hanks","subword","The Green Mile","subword","Forest Gump","subword","Philadelphia Story","subword","Big","subword","Cast Away", "foo_ti","10"));
assertU(adoc("id","44","name","Harrison Ford","subword","Star Wars","subword","Indiana Jones","subword","Patriot Games","subword","Regarding Henry"));
@ -75,7 +54,6 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
assertU(adoc("id","46","name","Nicole Kidman","subword","Batman","subword","Days of Thunder","subword","Eyes Wide Shut","subword","Far and Away"));
assertU(commit());
params.set(CommonParams.Q, "id:42");
params.set(MoreLikeThisParams.MLT, "true");
params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword");
params.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
@ -83,67 +61,128 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
params.set(MoreLikeThisParams.MIN_DOC_FREQ,"1");
params.set("indent","true");
SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params);
assertQ("morelikethis - tom cruise",mltreq
,"//result/doc[1]/str[@name='id'][.='46']"
,"//result/doc[2]/str[@name='id'][.='43']");
// requires 'q' or a single content stream
SolrException ex = expectThrows(SolrException.class, () -> {
try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) {
mlt.handleRequestBody(req, new SolrQueryResponse());
}
});
assertEquals(ex.getMessage(), MoreLikeThisHandler.ERR_MSG_QUERY_OR_TEXT_REQUIRED);
assertEquals(ex.code(), SolrException.ErrorCode.BAD_REQUEST.code);
// requires a single content stream (more than one is not supported).
ex = expectThrows(SolrException.class, () -> {
try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) {
ArrayList<ContentStream> streams = new ArrayList<>(2);
streams.add(new ContentStreamBase.StringStream("hello"));
streams.add(new ContentStreamBase.StringStream("there"));
req.setContentStreams(streams);
mlt.handleRequestBody(req, new SolrQueryResponse());
}
});
assertEquals(ex.getMessage(), MoreLikeThisHandler.ERR_MSG_SINGLE_STREAM_ONLY);
assertEquals(ex.code(), SolrException.ErrorCode.BAD_REQUEST.code);
params.set(CommonParams.Q, "id:42");
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ("morelikethis - tom cruise", mltreq,
"//result/doc[1]/str[@name='id'][.='46']",
"//result/doc[2]/str[@name='id'][.='43']");
}
params.set(MoreLikeThisParams.BOOST, "true");
mltreq.close(); mltreq = new LocalSolrQueryRequest( core, params);
assertQ("morelikethis - tom cruise",mltreq
,"//result/doc[1]/str[@name='id'][.='46']"
,"//result/doc[2]/str[@name='id'][.='43']");
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ("morelikethis - tom cruise", mltreq,
"//result/doc[1]/str[@name='id'][.='46']",
"//result/doc[2]/str[@name='id'][.='43']");
}
params.set(CommonParams.Q, "id:44");
mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
assertQ("morelike this - harrison ford",mltreq
,"//result/doc[1]/str[@name='id'][.='45']");
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ("morelike this - harrison ford", mltreq,
"//result/doc[1]/str[@name='id'][.='45']");
}
// test MoreLikeThis debug
params.set(CommonParams.DEBUG_QUERY, "true");
mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
assertQ("morelike this - harrison ford",mltreq
,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']"
,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']"
,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']"
,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']"
);
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ("morelike this - harrison ford", mltreq,
"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']",
"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']",
"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']",
"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']"
);
}
// test that qparser plugins work
params.remove(CommonParams.DEBUG_QUERY);
params.set(CommonParams.Q, "{!field f=id}44");
mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
assertQ(mltreq
,"//result/doc[1]/str[@name='id'][.='45']");
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ(mltreq, "//result/doc[1]/str[@name='id'][.='45']");
}
params.set(CommonParams.Q, "id:42");
params.set(MoreLikeThisParams.QF,"name^5.0 subword^0.1");
mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
assertQ("morelikethis with weights",mltreq
,"//result/doc[1]/str[@name='id'][.='43']"
,"//result/doc[2]/str[@name='id'][.='46']");
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ("morelikethis with weights", mltreq,
"//result/doc[1]/str[@name='id'][.='43']",
"//result/doc[2]/str[@name='id'][.='46']");
}
// test that qparser plugins work w/ the MoreLikeThisHandler
params.set(CommonParams.QT, "/mlt");
params.set(CommonParams.Q, "{!field f=id}44");
mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
assertQ(mltreq
,"//result/doc[1]/str[@name='id'][.='45']");
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ(mltreq, "//result/doc[1]/str[@name='id'][.='45']");
}
// test that debugging works (test for MoreLikeThis*Handler*)
params.set(CommonParams.QT, "/mlt");
params.set(CommonParams.DEBUG_QUERY, "true");
mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
assertQ(mltreq
,"//result/doc[1]/str[@name='id'][.='45']"
,"//lst[@name='debug']/lst[@name='explain']"
);
try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
assertQ(mltreq,
"//result/doc[1]/str[@name='id'][.='45']",
"//lst[@name='debug']/lst[@name='explain']"
);
}
}
// params.put(MoreLikeThisParams.QF,new String[]{"foo_ti"});
// String response = h.query(mltreq);
// System.out.println(response);
/**
 * Exercises the "/mlt" request handler with a single content stream while two similarity
 * fields ("name" and "subword") are requested (SOLR-5351).
 *
 * Four documents are indexed, the request is issued with the stream attached, and the response
 * is expected to list an interesting term for "bbb" under both fields and to report four hits.
 */
@Test
public void testMultifieldSimilarity() throws Exception
{
SolrCore core = h.getCore();
// NOTE(review): 'mlt' is not referenced again in this method as shown — confirm whether it is needed.
MoreLikeThisHandler mlt = new MoreLikeThisHandler();
// NOTE(review): 'mltreq' is not declared anywhere in this method as shown; this line looks like a
// removed line from the diff rendering rather than part of the new test — confirm against the real file.
mltreq.close();
ModifiableSolrParams params = new ModifiableSolrParams();
// Index four documents whose "name" and "subword" values share the terms aaa/bbb/ccc/zzz in
// different combinations.
assertU(adoc("id", "1", "name", "aaa bbb ccc", "subword", " zzz"));
assertU(adoc("id", "2", "name", " bbb ccc", "subword", " bbb zzz"));
assertU(adoc("id", "3", "name", " ccc", "subword", "aaa bbb zzz"));
assertU(adoc("id", "4", "name", " ccc", "subword", " bbb "));
assertU(commit());
// Route the request to the /mlt handler and ask for similarity over both fields, with
// per-term details and boosting enabled.
params.set(CommonParams.QT, "/mlt");
params.set(MoreLikeThisParams.MLT, "true");
params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword");
params.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
params.set(MoreLikeThisParams.MIN_TERM_FREQ, "1");
params.set(MoreLikeThisParams.MIN_DOC_FREQ, "2");
params.set(MoreLikeThisParams.BOOST, true);
params.set("indent", "true");
// try-with-resources closes the request once the assertions have run.
try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) {
// The list is created with capacity 2, but only one stream is added.
ArrayList<ContentStream> streams = new ArrayList<>(2);
// NOTE(review): the meaning of StringStream's second argument ("zzz") is not visible here —
// confirm it is not intended to be additional stream content.
streams.add(new ContentStreamBase.StringStream("bbb", "zzz"));
req.setContentStreams(streams);
// Make sure we have terms from both fields in the interestingTerms array and all documents have been
// retrieved as matching.
assertQ(req,
"//lst[@name = 'interestingTerms']/float[@name = 'subword:bbb']",
"//lst[@name = 'interestingTerms']/float[@name = 'name:bbb']",
"//result[@name = 'response' and @numFound = '4']");
}
}
}