Expose Lucene's new TopTermsBlendedFreqScoringRewrite.

This rewrite method is interesting because it computes scores as if all terms
had the same frequency. This avoids the disappointing ranking in which a fuzzy
query ranks typos first because they are less frequent than the correct term.
This commit is contained in:
Adrien Grand 2015-07-08 16:01:47 +02:00
parent 9519100eca
commit 8238f497d8
10 changed files with 81 additions and 50 deletions

View File

@ -95,7 +95,7 @@ public class FuzzyQueryParser implements QueryParser {
} else if ("transpositions".equals(currentFieldName)) {
transpositions = parser.booleanValue();
} else if ("rewrite".equals(currentFieldName)) {
rewriteMethod = QueryParsers.parseRewriteMethod(parser.textOrNull(), null);
rewriteMethod = QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull(), null);
} else if ("_name".equals(currentFieldName)) {
queryName = parser.text();
} else {

View File

@ -124,9 +124,9 @@ public class MatchQueryParser implements QueryParser {
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
minimumShouldMatch = parser.textOrNull();
} else if ("rewrite".equals(currentFieldName)) {
matchQuery.setRewriteMethod(QueryParsers.parseRewriteMethod(parser.textOrNull(), null));
matchQuery.setRewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull(), null));
} else if ("fuzzy_rewrite".equals(currentFieldName) || "fuzzyRewrite".equals(currentFieldName)) {
matchQuery.setFuzzyRewriteMethod(QueryParsers.parseRewriteMethod(parser.textOrNull(), null));
matchQuery.setFuzzyRewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull(), null));
} else if ("fuzzy_transpositions".equals(currentFieldName)) {
matchQuery.setTranspositions(parser.booleanValue());
} else if ("lenient".equals(currentFieldName)) {

View File

@ -114,9 +114,9 @@ public class MultiMatchQueryParser implements QueryParser {
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
minimumShouldMatch = parser.textOrNull();
} else if ("rewrite".equals(currentFieldName)) {
multiMatchQuery.setRewriteMethod(QueryParsers.parseRewriteMethod(parser.textOrNull(), null));
multiMatchQuery.setRewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull(), null));
} else if ("fuzzy_rewrite".equals(currentFieldName) || "fuzzyRewrite".equals(currentFieldName)) {
multiMatchQuery.setFuzzyRewriteMethod(QueryParsers.parseRewriteMethod(parser.textOrNull(), null));
multiMatchQuery.setFuzzyRewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull(), null));
} else if ("use_dis_max".equals(currentFieldName) || "useDisMax".equals(currentFieldName)) {
useDisMax = parser.booleanValue();
} else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) {

View File

@ -97,7 +97,7 @@ public class PrefixQueryParser implements QueryParser {
throw new QueryParsingException(parseContext, "No value specified for prefix query");
}
MultiTermQuery.RewriteMethod method = QueryParsers.parseRewriteMethod(rewriteMethod, null);
MultiTermQuery.RewriteMethod method = QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), rewriteMethod, null);
Query query = null;
MappedFieldType fieldType = parseContext.fieldMapper(fieldName);

View File

@ -175,7 +175,7 @@ public class QueryStringQueryParser implements QueryParser {
} else if ("fuzzy_max_expansions".equals(currentFieldName) || "fuzzyMaxExpansions".equals(currentFieldName)) {
qpSettings.fuzzyMaxExpansions(parser.intValue());
} else if ("fuzzy_rewrite".equals(currentFieldName) || "fuzzyRewrite".equals(currentFieldName)) {
qpSettings.fuzzyRewriteMethod(QueryParsers.parseRewriteMethod(parser.textOrNull()));
qpSettings.fuzzyRewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull()));
} else if ("phrase_slop".equals(currentFieldName) || "phraseSlop".equals(currentFieldName)) {
qpSettings.phraseSlop(parser.intValue());
} else if (parseContext.parseFieldMatcher().match(currentFieldName, FUZZINESS)) {
@ -187,7 +187,7 @@ public class QueryStringQueryParser implements QueryParser {
} else if ("analyze_wildcard".equals(currentFieldName) || "analyzeWildcard".equals(currentFieldName)) {
qpSettings.analyzeWildcard(parser.booleanValue());
} else if ("rewrite".equals(currentFieldName)) {
qpSettings.rewriteMethod(QueryParsers.parseRewriteMethod(parser.textOrNull()));
qpSettings.rewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), parser.textOrNull()));
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
qpSettings.minimumShouldMatch(parser.textOrNull());
} else if ("quote_field_suffix".equals(currentFieldName) || "quoteFieldSuffix".equals(currentFieldName)) {

View File

@ -109,7 +109,7 @@ public class RegexpQueryParser implements QueryParser {
throw new QueryParsingException(parseContext, "No value specified for regexp query");
}
MultiTermQuery.RewriteMethod method = QueryParsers.parseRewriteMethod(rewriteMethod, null);
MultiTermQuery.RewriteMethod method = QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), rewriteMethod, null);
Query query = null;
MappedFieldType fieldType = parseContext.fieldMapper(fieldName);

View File

@ -25,7 +25,6 @@ import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.query.support.QueryParsers;
@ -103,8 +102,8 @@ public class WildcardQueryParser implements QueryParser {
}
WildcardQuery wildcardQuery = new WildcardQuery(new Term(fieldName, valueBytes));
QueryParsers.setRewriteMethod(wildcardQuery, rewriteMethod);
wildcardQuery.setRewriteMethod(QueryParsers.parseRewriteMethod(rewriteMethod));
QueryParsers.setRewriteMethod(wildcardQuery, parseContext.parseFieldMatcher(), rewriteMethod);
wildcardQuery.setRewriteMethod(QueryParsers.parseRewriteMethod(parseContext.parseFieldMatcher(), rewriteMethod));
wildcardQuery.setBoost(boost);
if (queryName != null) {
parseContext.addNamedQuery(queryName, wildcardQuery);

View File

@ -20,14 +20,22 @@
package org.elasticsearch.index.query.support;
import org.apache.lucene.search.MultiTermQuery;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParseFieldMatcher;
/**
*
*/
public final class QueryParsers {
private static final ParseField CONSTANT_SCORE = new ParseField("constant_score", "constant_score_auto", "constant_score_filter");
private static final ParseField SCORING_BOOLEAN = new ParseField("scoring_boolean");
private static final ParseField CONSTANT_SCORE_BOOLEAN = new ParseField("constant_score_boolean");
private static final ParseField TOP_TERMS = new ParseField("top_terms_");
private static final ParseField TOP_TERMS_BOOST = new ParseField("top_terms_boost_");
private static final ParseField TOP_TERMS_BLENDED_FREQS = new ParseField("top_terms_blended_freqs_");
private QueryParsers() {
}
@ -39,50 +47,55 @@ public final class QueryParsers {
query.setRewriteMethod(rewriteMethod);
}
public static void setRewriteMethod(MultiTermQuery query, @Nullable String rewriteMethod) {
public static void setRewriteMethod(MultiTermQuery query, ParseFieldMatcher matcher, @Nullable String rewriteMethod) {
if (rewriteMethod == null) {
return;
}
query.setRewriteMethod(parseRewriteMethod(rewriteMethod));
query.setRewriteMethod(parseRewriteMethod(matcher, rewriteMethod));
}
public static MultiTermQuery.RewriteMethod parseRewriteMethod(@Nullable String rewriteMethod) {
return parseRewriteMethod(rewriteMethod, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
public static MultiTermQuery.RewriteMethod parseRewriteMethod(ParseFieldMatcher matcher, @Nullable String rewriteMethod) {
return parseRewriteMethod(matcher, rewriteMethod, MultiTermQuery.CONSTANT_SCORE_REWRITE);
}
public static MultiTermQuery.RewriteMethod parseRewriteMethod(@Nullable String rewriteMethod, @Nullable MultiTermQuery.RewriteMethod defaultRewriteMethod) {
public static MultiTermQuery.RewriteMethod parseRewriteMethod(ParseFieldMatcher matcher, @Nullable String rewriteMethod, @Nullable MultiTermQuery.RewriteMethod defaultRewriteMethod) {
if (rewriteMethod == null) {
return defaultRewriteMethod;
}
if ("constant_score_auto".equals(rewriteMethod) || "constant_score_auto".equals(rewriteMethod)) {
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
if (matcher.match(rewriteMethod, CONSTANT_SCORE)) {
return MultiTermQuery.CONSTANT_SCORE_REWRITE;
}
if ("scoring_boolean".equals(rewriteMethod) || "scoringBoolean".equals(rewriteMethod)) {
return MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
if (matcher.match(rewriteMethod, SCORING_BOOLEAN)) {
return MultiTermQuery.SCORING_BOOLEAN_REWRITE;
}
if ("constant_score_boolean".equals(rewriteMethod) || "constantScoreBoolean".equals(rewriteMethod)) {
return MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
if (matcher.match(rewriteMethod, CONSTANT_SCORE_BOOLEAN)) {
return MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE;
}
if ("constant_score_filter".equals(rewriteMethod) || "constantScoreFilter".equals(rewriteMethod)) {
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
int firstDigit = -1;
for (int i = 0; i < rewriteMethod.length(); ++i) {
if (Character.isDigit(rewriteMethod.charAt(i))) {
firstDigit = i;
break;
}
}
if (rewriteMethod.startsWith("top_terms_boost_")) {
int size = Integer.parseInt(rewriteMethod.substring("top_terms_boost_".length()));
return new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(size);
}
if (rewriteMethod.startsWith("topTermsBoost")) {
int size = Integer.parseInt(rewriteMethod.substring("topTermsBoost".length()));
return new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(size);
}
if (rewriteMethod.startsWith("top_terms_")) {
int size = Integer.parseInt(rewriteMethod.substring("top_terms_".length()));
return new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(size);
}
if (rewriteMethod.startsWith("topTerms")) {
int size = Integer.parseInt(rewriteMethod.substring("topTerms".length()));
return new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(size);
if (firstDigit >= 0) {
final int size = Integer.parseInt(rewriteMethod.substring(firstDigit));
String rewriteMethodName = rewriteMethod.substring(0, firstDigit);
if (matcher.match(rewriteMethodName, TOP_TERMS)) {
return new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(size);
}
if (matcher.match(rewriteMethodName, TOP_TERMS_BOOST)) {
return new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(size);
}
if (matcher.match(rewriteMethodName, TOP_TERMS_BLENDED_FREQS)) {
return new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(size);
}
}
throw new IllegalArgumentException("Failed to parse rewrite_method [" + rewriteMethod + "]");
}
}

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.query;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
@ -29,6 +30,7 @@ import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.spatial.prefix.IntersectsPrefixTreeFilter;
@ -68,6 +70,7 @@ import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
@ -428,6 +431,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
assertThat(fuzzyQuery.getRewriteMethod(), instanceOf(MultiTermQuery.TopTermsBlendedFreqScoringRewrite.class));
}
@Test
@ -2423,4 +2427,16 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
q = csq.getQuery();
assertThat(q, instanceOf(TermsQuery.class));
}
/**
 * Checks that a prefix query whose {@code rewrite} parameter names the blended-frequencies
 * rewrite is parsed into a {@link PrefixQuery} that uses
 * {@link MultiTermQuery.TopTermsBlendedFreqScoringRewrite}.
 * Both the underscore spelling and the camelCase spelling of the rewrite name are exercised.
 */
@Test
public void testBlendedRewriteMethod() throws IOException {
IndexQueryParserService queryParser = queryParser();
// Same rewrite name in its two spellings; each one is parsed and checked in turn.
for (String rewrite : Arrays.asList("top_terms_blended_freqs_10", "topTermsBlendedFreqs10")) {
Query parsedQuery = queryParser.parse(prefixQuery("field", "val").rewrite(rewrite)).query();
// The parsed query must be a PrefixQuery before it is cast below.
assertThat(parsedQuery, instanceOf(PrefixQuery.class));
PrefixQuery prefixQuery = (PrefixQuery) parsedQuery;
assertThat(prefixQuery.getPrefix(), equalTo(new Term("field", "val")));
// The rewrite string must have selected the blended-frequencies rewrite implementation.
assertThat(prefixQuery.getRewriteMethod(), instanceOf(MultiTermQuery.TopTermsBlendedFreqScoringRewrite.class));
}
}
}

View File

@ -10,9 +10,11 @@ also happens on the
All of those queries allow you to control how they will get rewritten using
the `rewrite` parameter:
* When not set, or set to `constant_score_auto`, defaults to
automatically choosing either `constant_score_boolean` or
`constant_score_filter` based on query characteristics.
* `constant_score` (default): A rewrite method that performs like
`constant_score_boolean` when there are few matching terms and otherwise
visits all matching terms in sequence, marking the documents that match each
term. Matching documents are assigned a constant score equal to the query's
boost.
* `scoring_boolean`: A rewrite method that first translates each term
into a should clause in a boolean query, and keeps the scores as
computed by the query. Note that typically such scores are meaningless
@ -25,10 +27,6 @@ are not computed. Instead, each matching document receives a constant
score equal to the query's boost. This rewrite method will fail with a too
many clauses error if the number of matching terms exceeds the boolean query
limit (defaults to `1024`).
* `constant_score_filter`: A rewrite method that first creates a private
Filter by visiting each term in sequence and marking all docs for that
term. Matching documents are assigned a constant score equal to the
query's boost.
* `top_terms_N`: A rewrite method that first translates each term into
a should clause in a boolean query, and keeps the scores as computed by the
query. This rewrite method only uses the top scoring terms so it will
@ -39,4 +37,9 @@ into should clause in boolean query, but the scores are only computed as
the boost. This rewrite method only uses the top scoring terms so it
will not overflow the boolean max clause count. The `N` controls the
size of the top scoring terms to use.
* `top_terms_blended_freqs_N`: A rewrite method that first translates each
term into a should clause in a boolean query, but all term queries compute
scores as if they had the same frequency. In practice the frequency which is
used is the maximum frequency of all matching terms. This rewrite method only
uses the top scoring terms so it will not overflow the boolean max clause
count. The `N` controls the size of the top scoring terms to use.