From 41ecad9897bb8949bfed730cd988aec58aa69775 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 25 May 2018 11:39:42 +0200 Subject: [PATCH] SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with content stream. The similarity is calculated between the content stream's value and all fields listed in mlt.fl. --- solr/CHANGES.txt | 5 + .../solr/handler/MoreLikeThisHandler.java | 42 ++++- .../solr/handler/MoreLikeThisHandlerTest.java | 161 +++++++++++------- 3 files changed, 141 insertions(+), 67 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 9159e92b6fb..3218ab390d7 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -131,6 +131,10 @@ New Features Bug Fixes ---------------------- +* SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with + content stream. The similarity is calculated between the content stream's value and all + fields listed in mlt.fl. (Dawid Weiss) + * SOLR-12103: Raise CryptoKeys.DEFAULT_KEYPAIR_LENGTH from 1024 to 2048. (Mark Miller) * SOLR-12107: Fixed a error in [child] transformer that could ocur if documentCache was not used (hossman) @@ -425,6 +429,7 @@ Upgrade Notes New Features ---------------------- + * SOLR-11285: Simulation framework for autoscaling. 
(ab) * LUCENE-2899: In the Solr analysis-extras contrib, added support for the diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java index 62f1016bbaf..cce2939c583 100644 --- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java @@ -21,6 +21,8 @@ import java.io.Reader; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -36,6 +38,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.CharsRefBuilder; import org.apache.solr.common.SolrException; import org.apache.solr.common.StringUtils; import org.apache.solr.common.params.CommonParams; @@ -80,7 +83,13 @@ public class MoreLikeThisHandler extends RequestHandlerBase private static final Pattern splitList = Pattern.compile(",| "); private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - + + static final String ERR_MSG_QUERY_OR_TEXT_REQUIRED = + "MoreLikeThis requires either a query (?q=) or text to find similar documents."; + + static final String ERR_MSG_SINGLE_STREAM_ONLY = + "MoreLikeThis does not support multiple ContentStreams"; + @Override public void init(NamedList args) { super.init(args); @@ -156,7 +165,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase } if (iter.hasNext()) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, - "MoreLikeThis does not support multiple ContentStreams"); + ERR_MSG_SINGLE_STREAM_ONLY); } } } @@ -191,7 +200,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase } } else { throw new 
SolrException(SolrException.ErrorCode.BAD_REQUEST, - "MoreLikeThis requires either a query (?q=) or text to find similar documents."); + ERR_MSG_QUERY_OR_TEXT_REQUIRED); } } finally { @@ -411,10 +420,31 @@ public class MoreLikeThisHandler extends RequestHandlerBase public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List filters, List terms, int flags ) throws IOException { - // analyzing with the first field: previous (stupid) behavior - rawMLTQuery = mlt.like(mlt.getFieldNames()[0], reader); + // SOLR-5351: if we only check against a single field, use the reader directly. Otherwise we + // repeat the stream's content for multiple fields so that query terms can be pulled from any + // of those fields. + String [] fields = mlt.getFieldNames(); + if (fields.length == 1) { + rawMLTQuery = mlt.like(fields[0], reader); + } else { + CharsRefBuilder buffered = new CharsRefBuilder(); + char [] chunk = new char [1024]; + int len; + while ((len = reader.read(chunk)) >= 0) { + buffered.append(chunk, 0, len); + } + + Collection<Object> streamValue = Collections.singleton(buffered.get().toString()); + Map<String, Collection<Object>> multifieldDoc = new HashMap<>(fields.length); + for (String field : fields) { + multifieldDoc.put(field, streamValue); + } + + rawMLTQuery = mlt.like(multifieldDoc); + } + boostedMLTQuery = getBoostedQuery( rawMLTQuery ); - if( terms != null ) { + if (terms != null) { fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms ); } DocListAndSet results = new DocListAndSet(); diff --git a/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java index aa63ce32576..6b80014c0fb 100644 --- a/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java @@ -18,6 +18,7 @@ package org.apache.solr.handler; import java.util.ArrayList; import org.apache.solr.SolrTestCaseJ4; +import 
org.apache.solr.common.SolrException; import org.apache.solr.common.params.*; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; @@ -29,12 +30,10 @@ import org.apache.solr.response.SolrQueryResponse; import org.junit.BeforeClass; import org.junit.Test; - /** * TODO -- this needs to actually test the results/query etc */ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 { - @BeforeClass public static void moreLikeThisBeforeClass() throws Exception { initCore("solrconfig.xml", "schema.xml"); @@ -47,27 +46,7 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 { MoreLikeThisHandler mlt = new MoreLikeThisHandler(); ModifiableSolrParams params = new ModifiableSolrParams(); - SolrQueryRequestBase req = new SolrQueryRequestBase( core, params) {}; - - // requires 'q' or single content stream - try { - mlt.handleRequestBody( req, new SolrQueryResponse() ); - } - catch( Exception ex ) {} // expected - // requires 'q' or single content stream - try { - ArrayList streams = new ArrayList<>( 2 ); - streams.add( new ContentStreamBase.StringStream( "hello" ) ); - streams.add( new ContentStreamBase.StringStream( "there" ) ); - req.setContentStreams( streams ); - mlt.handleRequestBody( req, new SolrQueryResponse() ); - } - catch( Exception ex ) {} // expected - finally { - req.close(); - } - assertU(adoc("id","42","name","Tom Cruise","subword","Top Gun","subword","Risky Business","subword","The Color of Money","subword","Minority Report","subword", "Days of Thunder","subword", "Eyes Wide Shut","subword", "Far and Away", "foo_ti","10")); assertU(adoc("id","43","name","Tom Hanks","subword","The Green Mile","subword","Forest Gump","subword","Philadelphia Story","subword","Big","subword","Cast Away", "foo_ti","10")); assertU(adoc("id","44","name","Harrison Ford","subword","Star Wars","subword","Indiana Jones","subword","Patriot Games","subword","Regarding Henry")); @@ -75,7 +54,6 @@ public class 
MoreLikeThisHandlerTest extends SolrTestCaseJ4 { assertU(adoc("id","46","name","Nicole Kidman","subword","Batman","subword","Days of Thunder","subword","Eyes Wide Shut","subword","Far and Away")); assertU(commit()); - params.set(CommonParams.Q, "id:42"); params.set(MoreLikeThisParams.MLT, "true"); params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword"); params.set(MoreLikeThisParams.INTERESTING_TERMS, "details"); @@ -83,67 +61,128 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 { params.set(MoreLikeThisParams.MIN_DOC_FREQ,"1"); params.set("indent","true"); - SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params); - assertQ("morelikethis - tom cruise",mltreq - ,"//result/doc[1]/str[@name='id'][.='46']" - ,"//result/doc[2]/str[@name='id'][.='43']"); + // requires 'q' or a single content stream + SolrException ex = expectThrows(SolrException.class, () -> { + try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) { + mlt.handleRequestBody(req, new SolrQueryResponse()); + } + }); + assertEquals(ex.getMessage(), MoreLikeThisHandler.ERR_MSG_QUERY_OR_TEXT_REQUIRED); + assertEquals(ex.code(), SolrException.ErrorCode.BAD_REQUEST.code); + + // requires a single content stream (more than one is not supported). 
+ ex = expectThrows(SolrException.class, () -> { + try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) { + ArrayList streams = new ArrayList<>(2); + streams.add(new ContentStreamBase.StringStream("hello")); + streams.add(new ContentStreamBase.StringStream("there")); + req.setContentStreams(streams); + mlt.handleRequestBody(req, new SolrQueryResponse()); + } + }); + assertEquals(ex.getMessage(), MoreLikeThisHandler.ERR_MSG_SINGLE_STREAM_ONLY); + assertEquals(ex.code(), SolrException.ErrorCode.BAD_REQUEST.code); + + params.set(CommonParams.Q, "id:42"); + + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ("morelikethis - tom cruise", mltreq, + "//result/doc[1]/str[@name='id'][.='46']", + "//result/doc[2]/str[@name='id'][.='43']"); + } params.set(MoreLikeThisParams.BOOST, "true"); - mltreq.close(); mltreq = new LocalSolrQueryRequest( core, params); - assertQ("morelikethis - tom cruise",mltreq - ,"//result/doc[1]/str[@name='id'][.='46']" - ,"//result/doc[2]/str[@name='id'][.='43']"); + + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ("morelikethis - tom cruise", mltreq, + "//result/doc[1]/str[@name='id'][.='46']", + "//result/doc[2]/str[@name='id'][.='43']"); + } params.set(CommonParams.Q, "id:44"); - mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); - assertQ("morelike this - harrison ford",mltreq - ,"//result/doc[1]/str[@name='id'][.='45']"); + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ("morelike this - harrison ford", mltreq, + "//result/doc[1]/str[@name='id'][.='45']"); + } // test MoreLikeThis debug params.set(CommonParams.DEBUG_QUERY, "true"); - mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); - assertQ("morelike this - harrison ford",mltreq - ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']" - 
,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']" - ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']" - ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']" - ); + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ("morelike this - harrison ford", mltreq, + "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']", + "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']", + "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']", + "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']" + ); + } // test that qparser plugins work params.remove(CommonParams.DEBUG_QUERY); params.set(CommonParams.Q, "{!field f=id}44"); - mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); - assertQ(mltreq - ,"//result/doc[1]/str[@name='id'][.='45']"); + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ(mltreq, "//result/doc[1]/str[@name='id'][.='45']"); + } params.set(CommonParams.Q, "id:42"); params.set(MoreLikeThisParams.QF,"name^5.0 subword^0.1"); - mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); - assertQ("morelikethis with weights",mltreq - ,"//result/doc[1]/str[@name='id'][.='43']" - ,"//result/doc[2]/str[@name='id'][.='46']"); - + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ("morelikethis with weights", mltreq, + "//result/doc[1]/str[@name='id'][.='43']", + "//result/doc[2]/str[@name='id'][.='46']"); + } // test that qparser plugins work w/ the MoreLikeThisHandler params.set(CommonParams.QT, "/mlt"); params.set(CommonParams.Q, "{!field f=id}44"); - mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); - assertQ(mltreq - 
,"//result/doc[1]/str[@name='id'][.='45']"); + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ(mltreq, "//result/doc[1]/str[@name='id'][.='45']"); + } // test that debugging works (test for MoreLikeThis*Handler*) params.set(CommonParams.QT, "/mlt"); params.set(CommonParams.DEBUG_QUERY, "true"); - mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params); - assertQ(mltreq - ,"//result/doc[1]/str[@name='id'][.='45']" - ,"//lst[@name='debug']/lst[@name='explain']" - ); + try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) { + assertQ(mltreq, + "//result/doc[1]/str[@name='id'][.='45']", + "//lst[@name='debug']/lst[@name='explain']" + ); + } + } - // params.put(MoreLikeThisParams.QF,new String[]{"foo_ti"}); - // String response = h.query(mltreq); - // System.out.println(response); + @Test + public void testMultifieldSimilarity() throws Exception + { + SolrCore core = h.getCore(); + MoreLikeThisHandler mlt = new MoreLikeThisHandler(); - mltreq.close(); + ModifiableSolrParams params = new ModifiableSolrParams(); + + assertU(adoc("id", "1", "name", "aaa bbb ccc", "subword", " zzz")); + assertU(adoc("id", "2", "name", " bbb ccc", "subword", " bbb zzz")); + assertU(adoc("id", "3", "name", " ccc", "subword", "aaa bbb zzz")); + assertU(adoc("id", "4", "name", " ccc", "subword", " bbb ")); + assertU(commit()); + + params.set(CommonParams.QT, "/mlt"); + params.set(MoreLikeThisParams.MLT, "true"); + params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword"); + params.set(MoreLikeThisParams.INTERESTING_TERMS, "details"); + params.set(MoreLikeThisParams.MIN_TERM_FREQ, "1"); + params.set(MoreLikeThisParams.MIN_DOC_FREQ, "2"); + params.set(MoreLikeThisParams.BOOST, true); + params.set("indent", "true"); + + try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) { + ArrayList streams = new ArrayList<>(2); + streams.add(new ContentStreamBase.StringStream("bbb", "zzz")); + 
req.setContentStreams(streams); + + // Make sure we have terms from both fields in the interestingTerms array and all documents have been + // retrieved as matching. + assertQ(req, + "//lst[@name = 'interestingTerms']/float[@name = 'subword:bbb']", + "//lst[@name = 'interestingTerms']/float[@name = 'name:bbb']", + "//result[@name = 'response' and @numFound = '4']"); + } } }