From 8d5fff37ae87d58b849490843accfb2ae2425da0 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 16 Jul 2015 18:49:35 +0200 Subject: [PATCH] `multi_match` query applies boosts too many times. The `multi_match` query groups terms that have the same analyzer together and then applies the boost of the first query in each group. This is not necessary given that boosts for each term are already applied another way. --- .../lucene/queries/BlendedTermQuery.java | 33 +++++++++++++++---- .../index/search/MultiMatchQuery.java | 6 ++-- .../query/SimpleIndexQueryParserTests.java | 20 +++++++++++ .../query-dsl/multi-match-query.asciidoc | 15 ++++++--- 4 files changed, 60 insertions(+), 14 deletions(-) diff --git a/core/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java b/core/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java index dd68ba16983..17485d024ae 100644 --- a/core/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java +++ b/core/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java @@ -33,6 +33,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.InPlaceMergeSorter; +import org.apache.lucene.util.ToStringUtils; import java.io.IOException; import java.util.Arrays; @@ -62,13 +63,17 @@ import java.util.List; public abstract class BlendedTermQuery extends Query { private final Term[] terms; + private final float[] boosts; - - public BlendedTermQuery(Term[] terms) { + public BlendedTermQuery(Term[] terms, float[] boosts) { if (terms == null || terms.length == 0) { throw new IllegalArgumentException("terms must not be null or empty"); } + if (boosts != null && boosts.length != terms.length) { + throw new IllegalArgumentException("boosts must have the same size as terms"); + } this.terms = terms; + this.boosts = boosts; } @Override @@ -231,8 +236,22 @@ public abstract class BlendedTermQuery extends Query { @Override public String toString(String field) { - return "blended(terms: " + Arrays.toString(terms) + ")"; - + StringBuilder builder = new StringBuilder("blended(terms:["); + for (int i = 0; i < terms.length; ++i) { + builder.append(terms[i]); + float boost = 1f; + if (boosts != null) { + boost = boosts[i]; + } + builder.append(ToStringUtils.boost(boost)); + builder.append(", "); + } + if (terms.length > 0) { + builder.setLength(builder.length() - 2); + } + builder.append("])"); + builder.append(ToStringUtils.boost(getBoost())); + return builder.toString(); } private volatile Term[] equalTerms = null; @@ -277,7 +296,7 @@ public abstract class BlendedTermQuery extends Query { } public static BlendedTermQuery booleanBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord) { - return new BlendedTermQuery(terms) { + return new BlendedTermQuery(terms, boosts) { @Override protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) { BooleanQuery query = new BooleanQuery(disableCoord); @@ -294,7 +313,7 @@ public abstract class BlendedTermQuery extends Query { } public static BlendedTermQuery commonTermsBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord, final float maxTermFrequency) { - return new BlendedTermQuery(terms) { + return new BlendedTermQuery(terms, boosts) { @Override protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) { BooleanQuery query = new BooleanQuery(true); @@ -334,7 +353,7 @@ public abstract class BlendedTermQuery extends Query { } public static BlendedTermQuery dismaxBlendedQuery(Term[] terms, final float[] boosts, final float tieBreakerMultiplier) { - return new BlendedTermQuery(terms) { + return new BlendedTermQuery(terms, boosts) { @Override protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) { DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier); diff --git a/core/src/main/java/org/elasticsearch/index/search/MultiMatchQuery.java b/core/src/main/java/org/elasticsearch/index/search/MultiMatchQuery.java index 878b0502972..621e7d0afca 100644 --- a/core/src/main/java/org/elasticsearch/index/search/MultiMatchQuery.java +++ b/core/src/main/java/org/elasticsearch/index/search/MultiMatchQuery.java @@ -85,7 +85,7 @@ public class MultiMatchQuery extends MatchQuery { throw new IllegalStateException("No such type: " + type); } final List queries = queryBuilder.buildGroupedQueries(type, fieldNames, value, minimumShouldMatch); - return queryBuilder.conbineGrouped(queries); + return queryBuilder.combineGrouped(queries); } private QueryBuilder queryBuilder; @@ -119,7 +119,7 @@ public class MultiMatchQuery extends MatchQuery { return parseAndApply(type, field, value, minimumShouldMatch, boostValue); } - public Query conbineGrouped(List groupQuery) { + public Query combineGrouped(List groupQuery) { if (groupQuery == null || groupQuery.isEmpty()) { return null; } @@ -196,7 +196,7 @@ public class MultiMatchQuery extends MatchQuery { blendedFields = null; } final FieldAndFieldType fieldAndFieldType = group.get(0); - Query q = parseGroup(type.matchQueryType(), fieldAndFieldType.field, fieldAndFieldType.boost, value, minimumShouldMatch); + Query q = parseGroup(type.matchQueryType(), fieldAndFieldType.field, 1f, value, minimumShouldMatch); if (q != null) { queries.add(q); } diff --git a/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java index b164c448b8a..5381c92a3f9 100644 --- a/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java +++ b/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java @@ -54,6 +54,7 @@ import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.mapper.core.NumberFieldMapper; @@ -83,6 +84,7 @@ import static org.hamcrest.Matchers.*; public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase { private IndexQueryParserService queryParser; + private IndexService indexService; @Before public void setup() throws IOException { @@ -99,6 +101,7 @@ public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase { assertNotNull(doc.dynamicMappingsUpdate()); client().admin().indices().preparePutMapping("test").setType("person").setSource(doc.dynamicMappingsUpdate().toString()).get(); + this.indexService = indexService; queryParser = indexService.queryParserService(); } @@ -2269,6 +2272,23 @@ public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase { assertThat(parsedQuery, instanceOf(BooleanQuery.class)); } + public void testCrossFieldMultiMatchQuery() throws IOException { + IndexQueryParserService queryParser = queryParser(); + Query parsedQuery = queryParser.parse(multiMatchQuery("banon", "name.first^2", "name.last^3", "foobar").type(MultiMatchQueryBuilder.Type.CROSS_FIELDS)).query(); + try (Engine.Searcher searcher = indexService.shardSafe(0).acquireSearcher("test")) { + Query rewrittenQuery = searcher.searcher().rewrite(parsedQuery); + + BooleanQuery expected = new BooleanQuery(); + expected.add(new TermQuery(new Term("foobar", "banon")), Occur.SHOULD); + TermQuery tq1 = new TermQuery(new Term("name.first", "banon")); + tq1.setBoost(2); + TermQuery tq2 = new TermQuery(new Term("name.last", "banon")); + tq2.setBoost(3); + expected.add(new DisjunctionMaxQuery(Arrays.asList(tq1, tq2), 0f), Occur.SHOULD); + assertEquals(expected, rewrittenQuery); + } + } + @Test public void testSimpleQueryString() throws Exception { IndexQueryParserService queryParser = queryParser(); diff --git a/docs/reference/query-dsl/multi-match-query.asciidoc b/docs/reference/query-dsl/multi-match-query.asciidoc index ecfaad76d6d..fe1a5f6b974 100644 --- a/docs/reference/query-dsl/multi-match-query.asciidoc +++ b/docs/reference/query-dsl/multi-match-query.asciidoc @@ -302,10 +302,17 @@ document to match. (Compare this to That solves one of the two problems. The problem of differing term frequencies is solved by _blending_ the term frequencies for all fields in order to even -out the differences. In other words, `first_name:smith` will be treated as -though it has the same weight as `last_name:smith`. (Actually, -`last_name:smith` is given a tiny advantage over `first_name:smith`, just to -make the order of results more stable.) +out the differences. + +In practice, `first_name:smith` will be treated as though it has the same +frequencies as `last_name:smith`, plus one. This will make matches on +`first_name` and `last_name` have comparable scores, with a tiny advantage +for `last_name` since it is the most likely field that contains `smith`. + +Note that `cross_fields` is usually only useful on short string fields +that all have a `boost` of `1`. Otherwise boosts, term freqs and length +normalization contribute to the score in such a way that the blending of term +statistics is not meaningful anymore. If you run the above query through the <>, it returns this explanation: