More Like This Query: creates only one MLT query per field for all queried items.

Previously, one MLT query per field was created for each item. One issue with
this method is that the maximum number of selected terms was equal to the
number of items times 'max_query_terms'. Instead, users should have direct control
over the maximum number of selected terms allowed, regardless of the number of
queried items.

Another issue with the previous method is that it could lead to the
selection of rather uninteresting terms, merely because they were found in a
particular queried item. Instead, this new procedure enforces the selection of
interesting terms across ALL items, not within each item. This could lead to
search results where the best matching documents share the most characteristic
terms of all the queried items taken together.

Closes #6404
This commit is contained in:
Alex Ksikes 2014-06-03 17:52:23 +02:00
parent c41e63c2f9
commit 35cba50fce
4 changed files with 33 additions and 14 deletions

View File

@ -27,11 +27,13 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.FastStringReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
/**
@ -174,13 +176,17 @@ public class MoreLikeThisQuery extends Query {
}
/**
 * Sets a single text to find similar documents for.
 * <p>
 * The text is wrapped in a one-element array and handed to
 * {@link #setLikeText(String...)}.
 *
 * @param likeText the text to use as the "like" input
 */
public void setLikeText(String likeText) {
    // Delegate only: assigning the field here as well would just be overwritten
    // by the array-based overload with an equivalent one-element array.
    setLikeText(new String[]{likeText});
}
/**
 * Sets the texts to find similar documents for.
 * <p>
 * The given array is stored as-is (no defensive copy is made).
 *
 * @param likeText one or more texts to use as the "like" input
 */
public void setLikeText(String... likeText) {
this.likeText = likeText;
}
/**
 * Sets the texts to find similar documents for from a list.
 * <p>
 * The list is converted to an array and passed on to the array-based
 * {@code setLikeText} overload.
 *
 * @param likeText the texts to use as the "like" input
 */
public void setLikeText(List<String> likeText) {
    // Strings.EMPTY_ARRAY only serves as the type token for toArray
    // (presumably a zero-length String[] -- confirm against Strings).
    String[] texts = likeText.toArray(Strings.EMPTY_ARRAY);
    setLikeText(texts);
}
/**
 * Returns the fields held in {@code moreLikeFields}.
 * <p>
 * The internal array itself is returned, not a copy.
 *
 * @return the current {@code moreLikeFields} array
 */
public String[] getMoreLikeFields() {
return moreLikeFields;
}

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.query;
import com.google.common.collect.Lists;
import com.google.common.collect.ObjectArrays;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queries.TermsFilter;
@ -207,9 +208,11 @@ public class MoreLikeThisQueryParser implements QueryParser {
}
// fetching the items with multi-get
List<LikeText> likeTexts = fetchService.fetch(items);
// collapse the text onto the same field name
Collection<LikeText> likeTextsCollapsed = collapseTextOnField(likeTexts);
// right now we are just building a boolean query
BooleanQuery boolQuery = new BooleanQuery();
for (LikeText likeText : likeTexts) {
for (LikeText likeText : likeTextsCollapsed) {
addMoreLikeThis(boolQuery, mltQuery, likeText);
}
// exclude the items from the search
@ -260,6 +263,19 @@ public class MoreLikeThisQueryParser implements QueryParser {
return moreLikeFields;
}
/**
 * Merges all entries that share a field name into a single entry per field.
 * <p>
 * For every distinct {@code field}, the resulting {@link LikeText} carries the
 * concatenation of the {@code text} arrays of all input entries for that field,
 * in the order the entries were encountered. The input entries are not modified;
 * a new {@link LikeText} is created for every field.
 *
 * @param likeTexts the entries to collapse
 * @return one entry per distinct field, ordered by first occurrence of the field
 */
public static Collection<LikeText> collapseTextOnField(Collection<LikeText> likeTexts) {
    // Insertion-ordered so that callers building clauses from the result get a
    // stable, input-driven field order rather than one that depends on hashing.
    Map<String, LikeText> collapsedTexts = new java.util.LinkedHashMap<>();
    for (LikeText likeText : likeTexts) {
        String field = likeText.field;
        String[] text = likeText.text;
        // Values are never null, so a null lookup means "field not seen yet".
        LikeText previous = collapsedTexts.get(field);
        if (previous != null) {
            text = ObjectArrays.concat(previous.text, text, String.class);
        }
        // Re-putting an existing key keeps its original position in the map.
        collapsedTexts.put(field, new LikeText(field, text));
    }
    return collapsedTexts.values();
}
/**
 * Runs the fields of a multi-get item through the list-based
 * {@code removeUnsupportedFields} overload and stores the outcome back on the item.
 *
 * @param item                   the item whose fields are read and then replaced
 * @param analyzer               passed through to the list-based overload
 * @param failOnUnsupportedField passed through to the list-based overload
 * @throws IOException if the list-based overload throws it
 */
private void removeUnsupportedFields(MultiGetRequest.Item item, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
    // The no-arg toArray() is only specified to return Object[]; casting that to
    // String[] fails with ClassCastException unless the list happens to be backed
    // by a String[]. Request a String[] explicitly instead.
    item.fields(removeUnsupportedFields(Arrays.asList(item.fields()), analyzer, failOnUnsupportedField)
            .toArray(new String[0]));
}

View File

@ -33,6 +33,7 @@ import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.cache.recycler.CacheRecyclerModule;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.compress.CompressedString;
import org.elasticsearch.common.inject.AbstractModule;
@ -65,6 +66,7 @@ import org.elasticsearch.index.search.geo.GeoDistanceFilter;
import org.elasticsearch.index.search.geo.GeoPolygonFilter;
import org.elasticsearch.index.search.geo.InMemoryGeoBoundingBoxFilter;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.index.similarity.SimilarityModule;
import org.elasticsearch.indices.fielddata.breaker.CircuitBreakerService;
@ -1680,19 +1682,14 @@ public class SimpleIndexQueryParserTests extends ElasticsearchTestCase {
MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
parser.setFetchService(new MockMoreLikeThisFetchService());
List<MoreLikeThisFetchService.LikeText> likeTexts = new ArrayList<>();
String index = "test";
String type = "person";
for (int i = 1; i < 5; i++) {
for (String field : new String[]{"name.first", "name.last"}) {
MoreLikeThisFetchService.LikeText likeText = new MoreLikeThisFetchService.LikeText(
field, index + " " + type + " " + i + " " + field);
likeTexts.add(likeText);
}
}
List<LikeText> likeTexts = new ArrayList<>();
likeTexts.add(new LikeText("name.first", new String[]{
"test person 1 name.first", "test person 2 name.first", "test person 3 name.first", "test person 4 name.first"}));
likeTexts.add(new LikeText("name.last", new String[]{
"test person 1 name.last", "test person 2 name.last", "test person 3 name.last", "test person 4 name.last"}));
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-ids.json");
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
Query parsedQuery = queryParser.parse(query).query();
assertThat(parsedQuery, instanceOf(BooleanQuery.class));
BooleanQuery booleanQuery = (BooleanQuery) parsedQuery;
@ -1700,7 +1697,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchTestCase {
// check each clause is for each item
BooleanClause[] boolClauses = booleanQuery.getClauses();
for (int i=0; i<likeTexts.size(); i++) {
for (int i = 0; i < likeTexts.size(); i++) {
BooleanClause booleanClause = booleanQuery.getClauses()[i];
assertThat(booleanClause.getOccur(), is(BooleanClause.Occur.SHOULD));
assertThat(booleanClause.getQuery(), instanceOf(MoreLikeThisQuery.class));