SOLR-9644: Fixed SimpleMLTQParser and CloudMLTQParser to handle boosts properly and CloudMLTQParser to only extract actual values from IndexableField type fields to the filtered document.

This commit is contained in:
anshum 2017-01-09 13:05:21 -08:00
parent b8383db06e
commit 2b4e3dd941
5 changed files with 102 additions and 37 deletions

View File

@ -345,6 +345,10 @@ Bug Fixes
* SOLR-9883: Example schemaless solr config files can lead to invalid tlog replays: when updates are buffered,
update processors ordered before DistributedUpdateProcessor, e.g. field normalization, are never run. (Steve Rowe)
* SOLR-9644: SimpleMLTQParser and CloudMLTQParser did not handle field boosts properly
and CloudMLTQParser included extra strings from the field definitions in the query.
(Ere Maijala via Anshum Gupta)
Other Changes
----------------------

View File

@ -22,6 +22,7 @@ import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.legacy.LegacyNumericUtils;
import org.apache.lucene.queries.mlt.MoreLikeThis;
@ -64,73 +65,83 @@ public class CloudMLTQParser extends QParser {
SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request. Could not fetch " +
"document with id [" + id + "]");
}
String[] qf = localParams.getParams("qf");
Map<String,Float> boostFields = new HashMap<>();
MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());
mlt.setMinTermFreq(localParams.getInt("mintf", MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
mlt.setMinDocFreq(localParams.getInt("mindf", 0));
mlt.setMinWordLen(localParams.getInt("minwl", MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
mlt.setMaxWordLen(localParams.getInt("maxwl", MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
if(localParams.get("boost") != null) {
mlt.setBoost(localParams.getBool("boost"));
boostFields = SolrPluginUtils.parseFieldBoosts(qf);
}
Boolean boost = localParams.getBool("boost", MoreLikeThis.DEFAULT_BOOST);
mlt.setBoost(boost);
mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());
Map<String, Collection<Object>> filteredDocument = new HashMap<>();
ArrayList<String> fieldNames = new ArrayList<>();
String[] fieldNames;
if (qf != null) {
ArrayList<String> fields = new ArrayList();
for (String fieldName : qf) {
if (!StringUtils.isEmpty(fieldName)) {
String[] strings = splitList.split(fieldName);
for (String string : strings) {
if (!StringUtils.isEmpty(string)) {
fieldNames.add(string);
fields.add(string);
}
}
}
}
// Parse field names and boosts from the fields
boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0]));
fieldNames = boostFields.keySet().toArray(new String[0]);
} else {
ArrayList<String> fields = new ArrayList();
for (String field : doc.getFieldNames()) {
// Only use fields that are stored and have an explicit analyzer.
// This makes sense as the query uses tf/idf/.. for query construction.
// We might want to relook and change this in the future though.
SchemaField f = req.getSchema().getFieldOrNull(field);
if (f != null && f.stored() && f.getType().isExplicitAnalyzer()) {
fieldNames.add(field);
fields.add(field);
}
}
fieldNames = fields.toArray(new String[0]);
}
if( fieldNames.size() < 1 ) {
if (fieldNames.length < 1) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires at least one similarity field: qf" );
}
mlt.setFieldNames(fieldNames.toArray(new String[fieldNames.size()]));
mlt.setFieldNames(fieldNames);
for (String field : fieldNames) {
filteredDocument.put(field, doc.getFieldValues(field));
Collection<Object> fieldValues = doc.getFieldValues(field);
if (fieldValues != null) {
Collection<Object> values = new ArrayList();
for (Object val : fieldValues) {
if (val instanceof IndexableField) {
values.add(((IndexableField)val).stringValue());
}
else {
values.add(val);
}
}
filteredDocument.put(field, values);
}
}
try {
Query rawMLTQuery = mlt.like(filteredDocument);
BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery;
if (boostFields.size() > 0) {
if (boost && boostFields.size() > 0) {
BooleanQuery.Builder newQ = new BooleanQuery.Builder();
newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch());

View File

@ -76,16 +76,13 @@ public class SimpleMLTQParser extends QParser {
mlt.setMaxQueryTerms(localParams.getInt("maxqt", MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
mlt.setMaxNumTokensParsed(localParams.getInt("maxntp", MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
mlt.setMaxDocFreq(localParams.getInt("maxdf", MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
Boolean boost = localParams.getBool("boost", false);
mlt.setBoost(boost);
// what happens if value is explicitly set to false?
if(localParams.get("boost") != null) {
mlt.setBoost(localParams.getBool("boost", false));
boostFields = SolrPluginUtils.parseFieldBoosts(qf);
}
String[] fieldNames;
ArrayList<String> fields = new ArrayList<>();
if (qf != null) {
ArrayList<String> fields = new ArrayList<>();
for (String fieldName : qf) {
if (!StringUtils.isEmpty(fieldName)) {
String[] strings = splitList.split(fieldName);
@ -96,26 +93,31 @@ public class SimpleMLTQParser extends QParser {
}
}
}
// Parse field names and boosts from the fields
boostFields = SolrPluginUtils.parseFieldBoosts(fields.toArray(new String[0]));
fieldNames = boostFields.keySet().toArray(new String[0]);
} else {
Map<String, SchemaField> fieldNames = req.getSearcher().getSchema().getFields();
for (String fieldName : fieldNames.keySet()) {
if (fieldNames.get(fieldName).indexed() && fieldNames.get(fieldName).stored())
if (fieldNames.get(fieldName).getType().getNumericType() == null)
Map<String, SchemaField> fieldDefinitions = req.getSearcher().getSchema().getFields();
ArrayList<String> fields = new ArrayList();
for (String fieldName : fieldDefinitions.keySet()) {
if (fieldDefinitions.get(fieldName).indexed() && fieldDefinitions.get(fieldName).stored())
if (fieldDefinitions.get(fieldName).getType().getNumericType() == null)
fields.add(fieldName);
}
fieldNames = fields.toArray(new String[0]);
}
if( fields.size() < 1 ) {
if (fieldNames.length < 1) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires at least one similarity field: qf" );
}
mlt.setFieldNames(fields.toArray(new String[fields.size()]));
mlt.setFieldNames(fieldNames);
mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());
Query rawMLTQuery = mlt.like(scoreDocs[0].doc);
BooleanQuery boostedMLTQuery = (BooleanQuery) rawMLTQuery;
if (boostFields.size() > 0) {
if (boost && boostFields.size() > 0) {
BooleanQuery.Builder newQ = new BooleanQuery.Builder();
newQ.setMinimumNumberShouldMatch(boostedMLTQuery.getMinimumNumberShouldMatch());

View File

@ -121,6 +121,27 @@ public class CloudMLTQParserTest extends SolrCloudTestCase {
}
assertArrayEquals(expectedIds, actualIds);
queryResponse = cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=false mintf=0 mindf=0}30"));
solrDocuments = queryResponse.getResults();
expectedIds = new int[]{31, 18, 23, 13, 14, 20, 22, 32, 19, 21};
actualIds = new int[solrDocuments.size()];
i = 0;
for (SolrDocument solrDocument : solrDocuments) {
actualIds[i++] = Integer.valueOf(String.valueOf(solrDocument.getFieldValue("id")));
}
System.out.println("DEBUG ACTUAL IDS 1: " + Arrays.toString(actualIds));
assertArrayEquals(expectedIds, actualIds);
queryResponse = cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u^10,lowerfilt1_u^1000 boost=true mintf=0 mindf=0}30"));
solrDocuments = queryResponse.getResults();
expectedIds = new int[]{29, 31, 32, 18, 23, 13, 14, 20, 22, 19};
actualIds = new int[solrDocuments.size()];
i = 0;
for (SolrDocument solrDocument : solrDocuments) {
actualIds[i++] = Integer.valueOf(String.valueOf(solrDocument.getFieldValue("id")));
}
System.out.println("DEBUG ACTUAL IDS 2: " + Arrays.toString(actualIds));
assertArrayEquals(expectedIds, actualIds);
}
@Test
@ -220,7 +241,7 @@ public class CloudMLTQParserTest extends SolrCloudTestCase {
}
assertArrayEquals(expectedIds, actualIds);
}
public void testInvalidSourceDocument() throws IOException {
SolrException e = expectThrows(SolrException.class, () -> {
cluster.getSolrClient().query(COLLECTION, new SolrQuery("{!mlt qf=lowerfilt_u}999999"));

View File

@ -107,9 +107,38 @@ public class SimpleMLTQParserTest extends SolrTestCaseJ4 {
"//result/doc[10]/int[@name='id'][.='23']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt,lowerfilt1^1000 boost=false mintf=0 mindf=0}30");
assertQ(req(params),
"//result/doc[1]/int[@name='id'][.='31']",
"//result/doc[2]/int[@name='id'][.='13']",
"//result/doc[3]/int[@name='id'][.='14']",
"//result/doc[4]/int[@name='id'][.='18']",
"//result/doc[5]/int[@name='id'][.='20']",
"//result/doc[6]/int[@name='id'][.='22']",
"//result/doc[7]/int[@name='id'][.='23']",
"//result/doc[8]/int[@name='id'][.='32']",
"//result/doc[9]/int[@name='id'][.='15']",
"//result/doc[10]/int[@name='id'][.='16']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt,lowerfilt1^1000 boost=true mintf=0 mindf=0}30");
assertQ(req(params),
"//result/doc[1]/int[@name='id'][.='29']",
"//result/doc[2]/int[@name='id'][.='31']",
"//result/doc[3]/int[@name='id'][.='32']",
"//result/doc[4]/int[@name='id'][.='13']",
"//result/doc[5]/int[@name='id'][.='14']",
"//result/doc[6]/int[@name='id'][.='18']",
"//result/doc[7]/int[@name='id'][.='20']",
"//result/doc[8]/int[@name='id'][.='22']",
"//result/doc[9]/int[@name='id'][.='23']",
"//result/doc[10]/int[@name='id'][.='15']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=0 mintf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result/doc[1]/int[@name='id'][.='29']",
"//result/doc[2]/int[@name='id'][.='27']",
@ -118,14 +147,12 @@ public class SimpleMLTQParserTest extends SolrTestCaseJ4 {
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt mindf=10 mintf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result[@numFound='0']"
);
params = new ModifiableSolrParams();
params.set(CommonParams.Q, "{!mlt qf=lowerfilt minwl=3 mintf=1 mindf=1}26");
params.set(CommonParams.DEBUG, "true");
assertQ(req(params),
"//result[@numFound='3']"
);