SOLR-7143: MoreLikeThis Query parser now handles multiple field names

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1689531 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Anshum Gupta 2015-07-07 00:05:23 +00:00
parent de18625659
commit 0325ef2ae4
6 changed files with 167 additions and 40 deletions

View File

@ -753,9 +753,10 @@ public final class MoreLikeThis {
IOException {
HashMap<String,Int> termFreqMap = new HashMap();
for (String fieldName : fieldNames) {
for (String field : fields.keySet()) {
Collection<Object> fieldValues = fields.get(field);
if(fieldValues == null)
continue;
for(Object fieldValue:fieldValues) {
if (fieldValue != null) {
addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap,

View File

@ -209,6 +209,9 @@ Bug Fixes
* SOLR-7741: Add missing fields to SolrIndexerConfig.toMap
(Mike Drob, Christine Poerschke via Ramkumar Aiyengar)
* SOLR-7143: MoreLikeThis Query parser should handle multiple field names
(Jens Wille, Anshum Gupta)
Optimizations
----------------------

View File

@ -20,6 +20,7 @@ import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@ -38,8 +39,11 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
public class CloudMLTQParser extends QParser {
// Pattern is thread safe -- TODO? share this with general 'fl' param
private static final Pattern splitList = Pattern.compile(",| ");
public CloudMLTQParser(String qstr, SolrParams localParams,
SolrParams params, SolrQueryRequest req) {
@ -86,14 +90,21 @@ public class CloudMLTQParser extends QParser {
String[] qf = localParams.getParams("qf");
Map<String, Collection<Object>> filteredDocument = new HashMap();
ArrayList<String> fieldNames = new ArrayList();
if (qf != null) {
mlt.setFieldNames(qf);
for (String field : qf) {
filteredDocument.put(field, doc.getFieldValues(field));
for (String fieldName : qf) {
if (!StringUtils.isEmpty(fieldName)) {
String[] strings = splitList.split(fieldName);
for (String string : strings) {
if (!StringUtils.isEmpty(string)) {
fieldNames.add(string);
}
}
}
}
} else {
Map<String, SchemaField> fields = req.getSchema().getFields();
ArrayList<String> fieldNames = new ArrayList();
for (String field : doc.getFieldNames()) {
// Only use fields that are stored and have an explicit analyzer.
// This makes sense as the query uses tf/idf/.. for query construction.
@ -101,10 +112,18 @@ public class CloudMLTQParser extends QParser {
if(fields.get(field).stored()
&& fields.get(field).getType().isExplicitAnalyzer()) {
fieldNames.add(field);
filteredDocument.put(field, doc.getFieldValues(field));
}
}
}
if( fieldNames.size() < 1 ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires at least one similarity field: qf" );
}
mlt.setFieldNames(fieldNames.toArray(new String[fieldNames.size()]));
for (String field : fieldNames) {
filteredDocument.put(field, doc.getFieldValues(field));
}
try {

View File

@ -25,6 +25,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
@ -35,8 +36,11 @@ import org.apache.solr.search.SolrIndexSearcher;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.regex.Pattern;
public class SimpleMLTQParser extends QParser {
// Pattern is thread safe -- TODO? share this with general 'fl' param
private static final Pattern splitList = Pattern.compile(",| ");
public SimpleMLTQParser(String qstr, SolrParams localParams,
SolrParams params, SolrQueryRequest req) {
@ -85,18 +89,30 @@ public class SimpleMLTQParser extends QParser {
ArrayList<String> fields = new ArrayList();
if (qf != null) {
mlt.setFieldNames(qf);
for (String fieldName : qf) {
if (!StringUtils.isEmpty(fieldName)) {
String[] strings = splitList.split(fieldName);
for (String string : strings) {
if (!StringUtils.isEmpty(string)) {
fields.add(string);
}
}
}
}
} else {
Map<String, SchemaField> fieldNames = req.getSearcher().getSchema().getFields();
for (String fieldName : fieldNames.keySet()) {
if (fieldNames.get(fieldName).indexed() && fieldNames.get(fieldName).stored())
if (fieldNames.get(fieldName).getType().getNumericType() == null)
fields.add(fieldName);
}
mlt.setFieldNames(fields.toArray(new String[fields.size()]));
}
if( fields.size() < 1 ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires at least one similarity field: qf" );
}
mlt.setFieldNames(fields.toArray(new String[fields.size()]));
mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());
return mlt.like(scoreDocs[0].doc);

View File

@ -59,6 +59,7 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
String id = "id";
delQ("*:*");
String FIELD1 = "lowerfilt" ;
String FIELD2 = "lowerfilt1" ;
indexDoc(sdoc(id, "1", FIELD1, "toyota"));
indexDoc(sdoc(id, "2", FIELD1, "chevrolet"));
@ -87,6 +88,14 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
indexDoc(sdoc(id, "26", FIELD1, "bmw usa 328i"));
indexDoc(sdoc(id, "27", FIELD1, "bmw usa 535i"));
indexDoc(sdoc(id, "28", FIELD1, "bmw 750Li"));
indexDoc(sdoc(id, "29", FIELD1, "bmw usa",
FIELD2, "red green blue"));
indexDoc(sdoc(id, "30", FIELD1, "The quote red fox jumped over the lazy brown dogs.",
FIELD2, "red green yellow"));
indexDoc(sdoc(id, "31", FIELD1, "The fat red fox jumped over the lazy brown dogs.",
FIELD2, "green blue yellow"));
indexDoc(sdoc(id, "32", FIELD1, "The slim red fox jumped over the lazy brown dogs.",
FIELD2, "yellow white black"));
commit();
@ -100,7 +109,7 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
params.set(CommonParams.Q, "{!mlt qf=lowerfilt}17");
QueryResponse queryResponse = cloudClient.query(params);
SolrDocumentList solrDocuments = queryResponse.getResults();
int[] expectedIds = new int[]{17, 7, 13, 14, 15, 16, 20, 22, 24, 9};
int[] expectedIds = new int[]{17, 7, 13, 14, 15, 16, 20, 22, 24, 32};
int[] actualIds = new int[10];
int i = 0;
for (SolrDocument solrDocument : solrDocuments) {
@ -113,7 +122,7 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
params.set(CommonParams.DEBUG, "true");
queryResponse = queryServer(params);
solrDocuments = queryResponse.getResults();
expectedIds = new int[]{3, 27, 26, 28};
expectedIds = new int[]{3, 29, 27, 26, 28};
actualIds = new int[solrDocuments.size()];
i = 0;
for (SolrDocument solrDocument : solrDocuments) {
@ -139,6 +148,36 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
actualParsedQueries.get(counter)));
}
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt,lowerfilt1 mindf=0 mintf=1}26");
params.set(CommonParams.DEBUG, "true");
queryResponse = queryServer(params);
solrDocuments = queryResponse.getResults();
expectedIds = new int[]{26, 27, 3, 29, 28};
actualIds = new int[solrDocuments.size()];
i = 0;
for (SolrDocument solrDocument : solrDocuments) {
actualIds[i++] = Integer.valueOf(String.valueOf(solrDocument.getFieldValue("id")));
}
assertArrayEquals(expectedIds, actualIds);
expectedQueryString = "lowerfilt:bmw lowerfilt:usa lowerfilt:328i";
if(queryResponse.getDebugMap().get("parsedquery") instanceof String) {
actualParsedQueries = new ArrayList();
actualParsedQueries.add((String) queryResponse.getDebugMap().get("parsedquery"));
} else {
actualParsedQueries = (ArrayList<String>) queryResponse
.getDebugMap().get("parsedquery");
}
for (int counter = 0; counter < actualParsedQueries.size(); counter++) {
assertTrue("Parsed queries aren't equal",
compareParsedQueryStrings(expectedQueryString,
actualParsedQueries.get(counter)));
}
params = new ModifiableSolrParams();
// Test out a high value of df and make sure nothing matches.
params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=20 mintf=1}3");
@ -161,7 +200,7 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
params.set(CommonParams.DEBUG, "true");
queryResponse = queryServer(params);
solrDocuments = queryResponse.getResults();
assertEquals("Expected to match 4 documents with a minwl of 3 but found more", solrDocuments.size(), 4);
assertEquals("Expected to match 4 documents with a minwl of 3 but found more", 5, solrDocuments.size());
// Assert that {!mlt}id does not throw an exception i.e. implicitly, only fields that are stored + have explicit
// analyzer are used for MLT Query construction.
@ -171,10 +210,12 @@ public class CloudMLTQParserTest extends AbstractFullDistribZkTestBase {
queryResponse = queryServer(params);
solrDocuments = queryResponse.getResults();
actualIds = new int[solrDocuments.size()];
expectedIds = new int[]{13, 14, 15, 16, 20, 22, 24, 18, 19, 21};
expectedIds = new int[]{13, 14, 15, 16, 20, 22, 24, 32, 18, 19};
i = 0;
StringBuilder sb = new StringBuilder();
for (SolrDocument solrDocument : solrDocuments) {
actualIds[i++] = Integer.valueOf(String.valueOf(solrDocument.getFieldValue("id")));
sb.append(actualIds[i-1]).append(", ");
}
assertArrayEquals(expectedIds, actualIds);
}

View File

@ -17,9 +17,13 @@ package org.apache.solr.search.mlt;
* limitations under the License.
*/
import java.util.ArrayList;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.response.SolrQueryResponse;
import org.junit.BeforeClass;
import org.junit.Test;
@ -34,38 +38,50 @@ public class SimpleMLTQParserTest extends SolrTestCaseJ4 {
@Test
public void doTest() throws Exception {
String id = "id";
String FIELD1 = "lowerfilt" ;
String FIELD2 = "lowerfilt1" ;
delQ("*:*");
assertU(adoc(id, "1", "lowerfilt", "toyota"));
assertU(adoc(id, "2", "lowerfilt", "chevrolet"));
assertU(adoc(id, "3", "lowerfilt", "suzuki"));
assertU(adoc(id, "4", "lowerfilt", "ford"));
assertU(adoc(id, "5", "lowerfilt", "ferrari"));
assertU(adoc(id, "6", "lowerfilt", "jaguar"));
assertU(adoc(id, "7", "lowerfilt", "mclaren moon or the moon and moon moon shine " +
assertU(adoc(id, "1", FIELD1, "toyota"));
assertU(adoc(id, "2", FIELD1, "chevrolet"));
assertU(adoc(id, "3", FIELD1, "suzuki"));
assertU(adoc(id, "4", FIELD1, "ford"));
assertU(adoc(id, "5", FIELD1, "ferrari"));
assertU(adoc(id, "6", FIELD1, "jaguar"));
assertU(adoc(id, "7", FIELD1, "mclaren moon or the moon and moon moon shine " +
"and the moon but moon was good foxes too"));
assertU(adoc(id, "8", "lowerfilt", "sonata"));
assertU(adoc(id, "9", "lowerfilt", "The quick red fox jumped over the lazy big " +
assertU(adoc(id, "8", FIELD1, "sonata"));
assertU(adoc(id, "9", FIELD1, "The quick red fox jumped over the lazy big " +
"and large brown dogs."));
assertU(adoc(id, "10", "lowerfilt", "blue"));
assertU(adoc(id, "12", "lowerfilt", "glue"));
assertU(adoc(id, "13", "lowerfilt", "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "14", "lowerfilt", "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "15", "lowerfilt", "The fat red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "16", "lowerfilt", "The slim red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "17", "lowerfilt", "The quote red fox jumped moon over the lazy " +
assertU(adoc(id, "10", FIELD1, "blue"));
assertU(adoc(id, "12", FIELD1, "glue"));
assertU(adoc(id, "13", FIELD1, "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "14", FIELD1, "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "15", FIELD1, "The fat red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "16", FIELD1, "The slim red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "17", FIELD1, "The quote red fox jumped moon over the lazy " +
"brown dogs moon. Of course moon. Foxes and moon come back to the foxes and moon"));
assertU(adoc(id, "18", "lowerfilt", "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "19", "lowerfilt", "The hose red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "20", "lowerfilt", "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "21", "lowerfilt", "The court red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "24", "lowerfilt", "The file red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "25", "lowerfilt", "rod fix"));
assertU(adoc(id, "18", FIELD1, "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "19", FIELD1, "The hose red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "20", FIELD1, "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "21", FIELD1, "The court red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "22", FIELD1, "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "23", FIELD1, "The quote red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "24", FIELD1, "The file red fox jumped over the lazy brown dogs."));
assertU(adoc(id, "25", FIELD1, "rod fix"));
assertU(adoc(id, "26", FIELD1, "bmw usa 328i"));
assertU(adoc(id, "27", FIELD1, "bmw usa 535i"));
assertU(adoc(id, "28", FIELD1, "bmw 750Li"));
assertU(adoc(id, "29", FIELD1, "bmw usa",
FIELD2, "red green blue"));
assertU(adoc(id, "30", FIELD1, "The quote red fox jumped over the lazy brown dogs.",
FIELD2, "red green yellow"));
assertU(adoc(id, "31", FIELD1, "The fat red fox jumped over the lazy brown dogs.",
FIELD2, "green blue yellow"));
assertU(adoc(id, "32", FIELD1, "The slim red fox jumped over the lazy brown dogs.",
FIELD2, "yellow white black"));
assertU(commit());
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt}17");
assertQ(req(params),
@ -80,6 +96,37 @@ public class SimpleMLTQParserTest extends SolrTestCaseJ4 {
"//result/doc[9]/int[@name='id'][.='22']",
"//result/doc[10]/int[@name='id'][.='23']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=0 mintf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result/doc[1]/int[@name='id'][.='26']",
"//result/doc[2]/int[@name='id'][.='29']",
"//result/doc[3]/int[@name='id'][.='27']",
"//result/doc[4]/int[@name='id'][.='28']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=10 mintf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result[@numFound='0']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt minwl=3 mintf=1 mindf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result[@numFound='4']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt minwl=4 mintf=1 mindf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result[@numFound='1']"
);
}
}