From 35cba50fce90d1c740cc7c234a21578dd1997b94 Mon Sep 17 00:00:00 2001 From: Alex Ksikes Date: Tue, 3 Jun 2014 17:52:23 +0200 Subject: [PATCH] More Like This Query: creates only one MLT query per field for all queried items. Previously, one MLT query per field was created for each item. One issue with this method is that the maximum number of selected terms was equal to the number of items times 'max_query_terms'. Instead, users should have direct control over the maximum number of selected terms allowed, regardless of the number of queried items. Another issue related to the previous method is that it could lead to the selection of rather uninteresting terms, simply because they were found in a particular queried item. Instead, this new procedure enforces the selection of interesting terms across ALL items, not within each item. This could lead to search results where the best matching items share commonalities amongst the best characteristics of all the items. Closes #6404 --- .../lucene/search/MoreLikeThisQuery.java | 8 ++++++- .../index/query/MoreLikeThisQueryParser.java | 18 +++++++++++++++- .../query/SimpleIndexQueryParserTests.java | 21 ++++++++----------- .../query/{mlt-ids.json => mlt-items.json} | 0 4 files changed, 33 insertions(+), 14 deletions(-) rename src/test/java/org/elasticsearch/index/query/{mlt-ids.json => mlt-items.json} (100%) diff --git a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java index 32110220fcb..bd0909538fc 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java @@ -27,11 +27,13 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.TFIDFSimilarity; +import 
org.elasticsearch.common.Strings; import org.elasticsearch.common.io.FastStringReader; import java.io.IOException; import java.io.Reader; import java.util.Arrays; +import java.util.List; import java.util.Set; /** @@ -174,13 +176,17 @@ public class MoreLikeThisQuery extends Query { } public void setLikeText(String likeText) { - this.likeText = new String[]{likeText}; + setLikeText(new String[]{likeText}); } public void setLikeText(String... likeText) { this.likeText = likeText; } + public void setLikeText(List likeText) { + setLikeText(likeText.toArray(Strings.EMPTY_ARRAY)); + } + public String[] getMoreLikeFields() { return moreLikeFields; } diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java index 1249de3b0c5..b67a1a4479c 100644 --- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java @@ -20,6 +20,7 @@ package org.elasticsearch.index.query; import com.google.common.collect.Lists; +import com.google.common.collect.ObjectArrays; import com.google.common.collect.Sets; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.queries.TermsFilter; @@ -207,9 +208,11 @@ public class MoreLikeThisQueryParser implements QueryParser { } // fetching the items with multi-get List likeTexts = fetchService.fetch(items); + // collapse the text onto the same field name + Collection likeTextsCollapsed = collapseTextOnField(likeTexts); // right now we are just building a boolean query BooleanQuery boolQuery = new BooleanQuery(); - for (LikeText likeText : likeTexts) { + for (LikeText likeText : likeTextsCollapsed) { addMoreLikeThis(boolQuery, mltQuery, likeText); } // exclude the items from the search @@ -260,6 +263,19 @@ public class MoreLikeThisQueryParser implements QueryParser { return moreLikeFields; } + public static Collection collapseTextOnField (Collection 
likeTexts) { + Map collapsedTexts = new HashMap<>(); + for (LikeText likeText : likeTexts) { + String field = likeText.field; + String[] text = likeText.text; + if (collapsedTexts.containsKey(field)) { + text = ObjectArrays.concat(collapsedTexts.get(field).text, text, String.class); + } + collapsedTexts.put(field, new LikeText(field, text)); + } + return collapsedTexts.values(); + } + private void removeUnsupportedFields(MultiGetRequest.Item item, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException { item.fields((String[]) removeUnsupportedFields(Arrays.asList(item.fields()), analyzer, failOnUnsupportedField).toArray()); } diff --git a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java index 7127b6c3b1d..28d3226f380 100644 --- a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java +++ b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java @@ -33,6 +33,7 @@ import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.get.MultiGetRequest; import org.elasticsearch.cache.recycler.CacheRecyclerModule; import org.elasticsearch.cluster.ClusterService; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.compress.CompressedString; import org.elasticsearch.common.inject.AbstractModule; @@ -65,6 +66,7 @@ import org.elasticsearch.index.search.geo.GeoDistanceFilter; import org.elasticsearch.index.search.geo.GeoPolygonFilter; import org.elasticsearch.index.search.geo.InMemoryGeoBoundingBoxFilter; import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService; +import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText; import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.index.similarity.SimilarityModule; import 
org.elasticsearch.indices.fielddata.breaker.CircuitBreakerService; @@ -1680,19 +1682,14 @@ public class SimpleIndexQueryParserTests extends ElasticsearchTestCase { MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this"); parser.setFetchService(new MockMoreLikeThisFetchService()); - List likeTexts = new ArrayList<>(); - String index = "test"; - String type = "person"; - for (int i = 1; i < 5; i++) { - for (String field : new String[]{"name.first", "name.last"}) { - MoreLikeThisFetchService.LikeText likeText = new MoreLikeThisFetchService.LikeText( - field, index + " " + type + " " + i + " " + field); - likeTexts.add(likeText); - } - } + List likeTexts = new ArrayList<>(); + likeTexts.add(new LikeText("name.first", new String[]{ + "test person 1 name.first", "test person 2 name.first", "test person 3 name.first", "test person 4 name.first"})); + likeTexts.add(new LikeText("name.last", new String[]{ + "test person 1 name.last", "test person 2 name.last", "test person 3 name.last", "test person 4 name.last"})); IndexQueryParserService queryParser = queryParser(); - String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-ids.json"); + String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json"); Query parsedQuery = queryParser.parse(query).query(); assertThat(parsedQuery, instanceOf(BooleanQuery.class)); BooleanQuery booleanQuery = (BooleanQuery) parsedQuery; @@ -1700,7 +1697,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchTestCase { // check each clause is for each item BooleanClause[] boolClauses = booleanQuery.getClauses(); - for (int i=0; i