From 2b4e3dd941a7a88274f2a86f18ea57a9d95e4364 Mon Sep 17 00:00:00 2001 From: anshum Date: Mon, 9 Jan 2017 13:05:21 -0800 Subject: [PATCH] SOLR-9644: Fixed SimpleMLTQParser and CloudMLTQParser to handle boosts properly and CloudMLTQParser to only extract actual values from IndexableField type fields to the filtered document. --- solr/CHANGES.txt | 4 ++ .../solr/search/mlt/CloudMLTQParser.java | 49 ++++++++++++------- .../solr/search/mlt/SimpleMLTQParser.java | 30 ++++++------ .../solr/search/mlt/CloudMLTQParserTest.java | 23 ++++++++- .../solr/search/mlt/SimpleMLTQParserTest.java | 33 +++++++++++-- 5 files changed, 102 insertions(+), 37 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c79b3c69f09..2b79f04c313 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -345,6 +345,10 @@ Bug Fixes * SOLR-9883: Example schemaless solr config files can lead to invalid tlog replays: when updates are buffered, update processors ordered before DistributedUpdateProcessor, e.g. field normalization, are never run. (Steve Rowe) +* SOLR-9644: SimpleMLTQParser and CloudMLTQParser did not handle field boosts properly + and CloudMLTQParser included extra strings from the field definitions in the query. + (Ere Maijala via Anshum Gupta) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java index 0f46725eb27..945047b097a 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/CloudMLTQParser.java @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.legacy.LegacyNumericUtils; import org.apache.lucene.queries.mlt.MoreLikeThis; @@ -64,73 +65,83 @@ public class CloudMLTQParser extends QParser { SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request. Could not fetch " + "document with id [" + id + "]"); } - + String[] qf = localParams.getParams("qf"); Map boostFields = new HashMap<>(); MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); - + mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ)); - mlt.setMinDocFreq(localParams.getInt("mindf", 0)); - mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH)); - mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH)); - mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); - mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); - mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); - if(localParams.get("boost") != null) { - mlt.setBoost(localParams.getBool("boost")); - boostFields = SolrPluginUtils.parseFieldBoosts(qf); - } + Boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST); + mlt.setBoost(boost); mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); Map> filteredDocument = new HashMap<>(); - ArrayList fieldNames = new ArrayList<>(); + String[] fieldNames; if (qf != null) { + ArrayList fields = new ArrayList(); for (String fieldName : qf) { if (!StringUtils.isEmpty(fieldName)) { String[] strings = splitList.split(fieldName); for (String string : strings) { if (!StringUtils.isEmpty(string)) { - fieldNames.add(string); + fields.add(string); } } } } + // Parse field names and boosts from the fields + boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0])); + fieldNames = boostFields.keySet().toArray(new String[0]); } else { + ArrayList fields = new ArrayList(); for (String field : doc.getFieldNames()) { // Only use fields that are stored and have an explicit analyzer. // This makes sense as the query uses tf/idf/.. for query construction. // We might want to relook and change this in the future though. SchemaField f = req.getSchema().getFieldOrNull(field); if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) { - fieldNames.add(field); + fields.add(field); } } + fieldNames = fields.toArray(new String[0]); } - if( fieldNames.size() < 1 ) { + if (fieldNames.length < 1) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "MoreLikeThis requires at least one similarity field: qf" ); } - mlt.setFieldNames(fieldNames.toArray(new String[fieldNames.size()])); + mlt.setFieldNames(fieldNames); for (String field : fieldNames) { - filteredDocument.put(field, doc.getFieldValues(field)); + Collection fieldValues = doc.getFieldValues(field); + if (fieldValues != null) { + Collection values = new ArrayList(); + for (Object val : fieldValues) { + if (val instanceof IndexableField) { + values.add(((IndexableField)val).stringValue()); + } + else { + values.add(val); + } + } + filteredDocument.put(field, values); + } } try { Query rawMLTQuery = mlt.like(filteredDocument); BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery; - if (boostFields.size() > 0) { + if (boost && boostFields.size() > 0) { BooleanQuery.Builder newQ = new BooleanQuery.Builder(); newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch()); diff --git a/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java b/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java index 50803df2e91..de6eb58286b 100644 --- a/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java +++ b/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java @@ -76,16 +76,13 @@ public class SimpleMLTQParser extends QParser { mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS)); mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED)); mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ)); + Boolean boost = localParams.getBool("boost", false); + mlt.setBoost(boost); - // what happens if value is explicitly set to false? - if(localParams.get("boost") != null) { - mlt.setBoost(localParams.getBool("boost", false)); - boostFields = SolrPluginUtils.parseFieldBoosts(qf); - } + String[] fieldNames; - ArrayList fields = new ArrayList<>(); - if (qf != null) { + ArrayList fields = new ArrayList<>(); for (String fieldName : qf) { if (!StringUtils.isEmpty(fieldName)) { String[] strings = splitList.split(fieldName); @@ -96,26 +93,31 @@ public class SimpleMLTQParser extends QParser { } } } + // Parse field names and boosts from the fields + boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0])); + fieldNames = boostFields.keySet().toArray(new String[0]); } else { - Map fieldNames = req.getSearcher().getSchema().getFields(); - for (String fieldName : fieldNames.keySet()) { - if (fieldNames.get(fieldName).indexed() && fieldNames.get(fieldName).stored()) - if (fieldNames.get(fieldName).getType().getNumericType() == null) + Map fieldDefinitions = req.getSearcher().getSchema().getFields(); + ArrayList fields = new ArrayList(); + for (String fieldName : fieldDefinitions.keySet()) { + if (fieldDefinitions.get(fieldName).indexed() && fieldDefinitions.get(fieldName).stored()) + if (fieldDefinitions.get(fieldName).getType().getNumericType() == null) fields.add(fieldName); } + fieldNames = fields.toArray(new String[0]); } - if( fields.size() < 1 ) { + if (fieldNames.length < 1) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "MoreLikeThis requires at least one similarity field: qf" ); } - mlt.setFieldNames(fields.toArray(new String[fields.size()])); + mlt.setFieldNames(fieldNames); mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); Query rawMLTQuery = mlt.like(scoreDocs[0].doc); BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery; - if (boostFields.size() > 0) { + if (boost && boostFields.size() > 0) { BooleanQuery.Builder newQ = new BooleanQuery.Builder(); newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch()); diff --git a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java index ffcde2f5704..e3a8d7b2d64 100644 --- a/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java +++ b/solr/core/src/test/org/apache/solr/search/mlt/CloudMLTQParserTest.java @@ -121,6 +121,27 @@ public class CloudMLTQParserTest extends SolrCloudTestCase { } assertArrayEquals(expectedIds, actualIds); + queryResponse = cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=false mintf=0 mindf=0}30")); + solrDocuments = queryResponse.getResults(); + expectedIds = new int[]{31, 18, 23, 13, 14, 20, 22, 32, 19, 21}; + actualIds = new int[solrDocuments.size()]; + i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.valueOf(String.valueOf(solrDocument.getFieldValue("id"))); + } + System.out.println("DEBUG ACTUAL IDS 1: " + Arrays.toString(actualIds)); + assertArrayEquals(expectedIds, actualIds); + + queryResponse = cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=true mintf=0 mindf=0}30")); + solrDocuments = queryResponse.getResults(); + expectedIds = new int[]{29, 31, 32, 18, 23, 13, 14, 20, 22, 19}; + actualIds = new int[solrDocuments.size()]; + i = 0; + for (SolrDocument solrDocument : solrDocuments) { + actualIds[i++] = Integer.valueOf(String.valueOf(solrDocument.getFieldValue("id"))); + } + System.out.println("DEBUG ACTUAL IDS 2: " + Arrays.toString(actualIds)); + assertArrayEquals(expectedIds, actualIds); } @Test @@ -220,7 +241,7 @@ public class CloudMLTQParserTest extends SolrCloudTestCase { } assertArrayEquals(expectedIds, actualIds); } - + public void testInvalidSourceDocument() throws IOException { SolrException e = expectThrows(SolrException.class, () -> { cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u}999999")); diff --git a/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java b/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java index 6f3570f6cba..026c5942f16 100644 --- a/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java +++ b/solr/core/src/test/org/apache/solr/search/mlt/SimpleMLTQParserTest.java @@ -107,9 +107,38 @@ public class SimpleMLTQParserTest extends SolrTestCaseJ4 { "//result/doc[10]/int[@name='id'][.='23']" ); + params = new ModifiableSolrParams(); + params.set(CommonParams.Q, "{!mlt qf=lowerfilt,lowerfilt1^1000 boost=false mintf=0 mindf=0}30"); + assertQ(req(params), + "//result/doc[1]/int[@name='id'][.='31']", + "//result/doc[2]/int[@name='id'][.='13']", + "//result/doc[3]/int[@name='id'][.='14']", + "//result/doc[4]/int[@name='id'][.='18']", + "//result/doc[5]/int[@name='id'][.='20']", + "//result/doc[6]/int[@name='id'][.='22']", + "//result/doc[7]/int[@name='id'][.='23']", + "//result/doc[8]/int[@name='id'][.='32']", + "//result/doc[9]/int[@name='id'][.='15']", + "//result/doc[10]/int[@name='id'][.='16']" + ); + + params = new ModifiableSolrParams(); + params.set(CommonParams.Q, "{!mlt qf=lowerfilt,lowerfilt1^1000 boost=true mintf=0 mindf=0}30"); + assertQ(req(params), + "//result/doc[1]/int[@name='id'][.='29']", + "//result/doc[2]/int[@name='id'][.='31']", + "//result/doc[3]/int[@name='id'][.='32']", + "//result/doc[4]/int[@name='id'][.='13']", + "//result/doc[5]/int[@name='id'][.='14']", + "//result/doc[6]/int[@name='id'][.='18']", + "//result/doc[7]/int[@name='id'][.='20']", + "//result/doc[8]/int[@name='id'][.='22']", + "//result/doc[9]/int[@name='id'][.='23']", + "//result/doc[10]/int[@name='id'][.='15']" + ); + params = new ModifiableSolrParams(); params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=0 mintf=1}26"); - params.set(CommonParams.DEBUG, "true"); assertQ(req(params), "//result/doc[1]/int[@name='id'][.='29']", "//result/doc[2]/int[@name='id'][.='27']", @@ -118,14 +147,12 @@ public class SimpleMLTQParserTest extends SolrTestCaseJ4 { params = new ModifiableSolrParams(); params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=10 mintf=1}26"); - params.set(CommonParams.DEBUG, "true"); assertQ(req(params), "//result[@numFound='0']" ); params = new ModifiableSolrParams(); params.set(CommonParams.Q, "{!mlt qf=lowerfilt minwl=3 mintf=1 mindf=1}26"); - params.set(CommonParams.DEBUG, "true"); assertQ(req(params), "//result[@numFound='3']" );