`multi_match` query applies boosts too many times.

The `multi_match` query groups terms that have the same analyzer together and
then applies the boost of the first query in each group. This is not necessary
given that boosts for each term are already applied another way.
This commit is contained in:
Adrien Grand 2015-07-16 18:49:35 +02:00
parent 88842f3319
commit 8d5fff37ae
4 changed files with 60 additions and 14 deletions

View File

@ -33,6 +33,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.Arrays;
@ -62,13 +63,17 @@ import java.util.List;
public abstract class BlendedTermQuery extends Query {
private final Term[] terms;
private final float[] boosts;
public BlendedTermQuery(Term[] terms) {
public BlendedTermQuery(Term[] terms, float[] boosts) {
if (terms == null || terms.length == 0) {
throw new IllegalArgumentException("terms must not be null or empty");
}
if (boosts != null && boosts.length != terms.length) {
throw new IllegalArgumentException("boosts must have the same size as terms");
}
this.terms = terms;
this.boosts = boosts;
}
@Override
@ -231,8 +236,22 @@ public abstract class BlendedTermQuery extends Query {
@Override
public String toString(String field) {
return "blended(terms: " + Arrays.toString(terms) + ")";
StringBuilder builder = new StringBuilder("blended(terms:[");
for (int i = 0; i < terms.length; ++i) {
builder.append(terms[i]);
float boost = 1f;
if (boosts != null) {
boost = boosts[i];
}
builder.append(ToStringUtils.boost(boost));
builder.append(", ");
}
if (terms.length > 0) {
builder.setLength(builder.length() - 2);
}
builder.append("])");
builder.append(ToStringUtils.boost(getBoost()));
return builder.toString();
}
private volatile Term[] equalTerms = null;
@ -277,7 +296,7 @@ public abstract class BlendedTermQuery extends Query {
}
public static BlendedTermQuery booleanBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord) {
return new BlendedTermQuery(terms) {
return new BlendedTermQuery(terms, boosts) {
@Override
protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) {
BooleanQuery query = new BooleanQuery(disableCoord);
@ -294,7 +313,7 @@ public abstract class BlendedTermQuery extends Query {
}
public static BlendedTermQuery commonTermsBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord, final float maxTermFrequency) {
return new BlendedTermQuery(terms) {
return new BlendedTermQuery(terms, boosts) {
@Override
protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) {
BooleanQuery query = new BooleanQuery(true);
@ -334,7 +353,7 @@ public abstract class BlendedTermQuery extends Query {
}
public static BlendedTermQuery dismaxBlendedQuery(Term[] terms, final float[] boosts, final float tieBreakerMultiplier) {
return new BlendedTermQuery(terms) {
return new BlendedTermQuery(terms, boosts) {
@Override
protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) {
DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier);

View File

@ -85,7 +85,7 @@ public class MultiMatchQuery extends MatchQuery {
throw new IllegalStateException("No such type: " + type);
}
final List<? extends Query> queries = queryBuilder.buildGroupedQueries(type, fieldNames, value, minimumShouldMatch);
return queryBuilder.conbineGrouped(queries);
return queryBuilder.combineGrouped(queries);
}
private QueryBuilder queryBuilder;
@ -119,7 +119,7 @@ public class MultiMatchQuery extends MatchQuery {
return parseAndApply(type, field, value, minimumShouldMatch, boostValue);
}
public Query conbineGrouped(List<? extends Query> groupQuery) {
public Query combineGrouped(List<? extends Query> groupQuery) {
if (groupQuery == null || groupQuery.isEmpty()) {
return null;
}
@ -196,7 +196,7 @@ public class MultiMatchQuery extends MatchQuery {
blendedFields = null;
}
final FieldAndFieldType fieldAndFieldType = group.get(0);
Query q = parseGroup(type.matchQueryType(), fieldAndFieldType.field, fieldAndFieldType.boost, value, minimumShouldMatch);
Query q = parseGroup(type.matchQueryType(), fieldAndFieldType.field, 1f, value, minimumShouldMatch);
if (q != null) {
queries.add(q);
}

View File

@ -54,6 +54,7 @@ import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.mapper.core.NumberFieldMapper;
@ -83,6 +84,7 @@ import static org.hamcrest.Matchers.*;
public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase {
private IndexQueryParserService queryParser;
private IndexService indexService;
@Before
public void setup() throws IOException {
@ -99,6 +101,7 @@ public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase {
assertNotNull(doc.dynamicMappingsUpdate());
client().admin().indices().preparePutMapping("test").setType("person").setSource(doc.dynamicMappingsUpdate().toString()).get();
this.indexService = indexService;
queryParser = indexService.queryParserService();
}
@ -2269,6 +2272,23 @@ public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase {
assertThat(parsedQuery, instanceOf(BooleanQuery.class));
}
public void testCrossFieldMultiMatchQuery() throws IOException {
IndexQueryParserService queryParser = queryParser();
Query parsedQuery = queryParser.parse(multiMatchQuery("banon", "name.first^2", "name.last^3", "foobar").type(MultiMatchQueryBuilder.Type.CROSS_FIELDS)).query();
try (Engine.Searcher searcher = indexService.shardSafe(0).acquireSearcher("test")) {
Query rewrittenQuery = searcher.searcher().rewrite(parsedQuery);
BooleanQuery expected = new BooleanQuery();
expected.add(new TermQuery(new Term("foobar", "banon")), Occur.SHOULD);
TermQuery tq1 = new TermQuery(new Term("name.first", "banon"));
tq1.setBoost(2);
TermQuery tq2 = new TermQuery(new Term("name.last", "banon"));
tq2.setBoost(3);
expected.add(new DisjunctionMaxQuery(Arrays.<Query>asList(tq1, tq2), 0f), Occur.SHOULD);
assertEquals(expected, rewrittenQuery);
}
}
@Test
public void testSimpleQueryString() throws Exception {
IndexQueryParserService queryParser = queryParser();

View File

@ -302,10 +302,17 @@ document to match. (Compare this to
That solves one of the two problems. The problem of differing term frequencies
is solved by _blending_ the term frequencies for all fields in order to even
out the differences. In other words, `first_name:smith` will be treated as
though it has the same weight as `last_name:smith`. (Actually,
`last_name:smith` is given a tiny advantage over `first_name:smith`, just to
make the order of results more stable.)
out the differences.
In practice, `first_name:smith` will be treated as though it has the same
frequencies as `last_name:smith`, plus one. This will make matches on
`first_name` and `last_name` have comparable scores, with a tiny advantage
for `last_name` since it is the most likely field that contains `smith`.
Note that `cross_fields` is usually only useful on short string fields
that all have a `boost` of `1`. Otherwise boosts, term freqs and length
normalization contribute to the score in such a way that the blending of term
statistics is not meaningful anymore.
If you run the above query through the <<search-validate>>, it returns this
explanation: