Expose splitOnWhitespace in `Query String Query` (#20965)
This change adds an option called `split_on_whitespace` which, when set to `false`, prevents the query parser from splitting the free text part on whitespace prior to analysis. Instead the query parser would parse around only real 'operators'. Defaults to true. For instance, with `split_on_whitespace` set to `false`, the query `"foo bar"` would let the analyzer of the targeted field decide how the tokens should be split. Some options are missing in this change but I'd like to add them in a follow-up PR in order to simplify the backport to 5.x. The missing options (changes) are: * A `type` option which, similarly to the `multi_match` query, defines how the free text should be parsed when multiple fields are defined. * Simple range queries with additional tokens like ">100 50" are broken when `split_on_whitespace` is set to false. It should be possible to preserve this syntax and make the parser aware of this special syntax even when `split_on_whitespace` is set to false. * Since all these options would make the `query_string_query` very similar to a match (multi_match) query, we should be able to share the code that produces the final Lucene query.
This commit is contained in:
parent
aa6cd93e0f
commit
9d6fac809c
|
@ -104,6 +104,7 @@ public class MapperQueryParser extends QueryParser {
|
|||
setDefaultOperator(settings.defaultOperator());
|
||||
setFuzzyPrefixLength(settings.fuzzyPrefixLength());
|
||||
setLocale(settings.locale());
|
||||
setSplitOnWhitespace(settings.splitOnWhitespace());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -79,6 +79,8 @@ public class QueryParserSettings {
|
|||
/** To limit effort spent determinizing regexp queries. */
|
||||
private int maxDeterminizedStates;
|
||||
|
||||
private boolean splitOnWhitespace;
|
||||
|
||||
public QueryParserSettings(String queryString) {
|
||||
this.queryString = queryString;
|
||||
}
|
||||
|
@ -290,4 +292,12 @@ public class QueryParserSettings {
|
|||
public Fuzziness fuzziness() {
|
||||
return fuzziness;
|
||||
}
|
||||
|
||||
public void splitOnWhitespace(boolean value) {
|
||||
this.splitOnWhitespace = value;
|
||||
}
|
||||
|
||||
public boolean splitOnWhitespace() {
|
||||
return splitOnWhitespace;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.search.BoostQuery;
|
|||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.ParsingException;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
|
@ -59,6 +60,8 @@ import java.util.TreeMap;
|
|||
public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQueryBuilder> {
|
||||
public static final String NAME = "query_string";
|
||||
|
||||
public static final Version V_5_1_0_UNRELEASED = Version.fromId(5010099);
|
||||
|
||||
public static final boolean DEFAULT_AUTO_GENERATE_PHRASE_QUERIES = false;
|
||||
public static final int DEFAULT_MAX_DETERMINED_STATES = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
public static final boolean DEFAULT_LOWERCASE_EXPANDED_TERMS = true;
|
||||
|
@ -72,6 +75,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
public static final Fuzziness DEFAULT_FUZZINESS = Fuzziness.AUTO;
|
||||
public static final Operator DEFAULT_OPERATOR = Operator.OR;
|
||||
public static final Locale DEFAULT_LOCALE = Locale.ROOT;
|
||||
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true;
|
||||
|
||||
private static final ParseField QUERY_FIELD = new ParseField("query");
|
||||
private static final ParseField FIELDS_FIELD = new ParseField("fields");
|
||||
|
@ -98,6 +102,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
private static final ParseField LENIENT_FIELD = new ParseField("lenient");
|
||||
private static final ParseField LOCALE_FIELD = new ParseField("locale");
|
||||
private static final ParseField TIME_ZONE_FIELD = new ParseField("time_zone");
|
||||
private static final ParseField SPLIT_ON_WHITESPACE = new ParseField("split_on_whitespace");
|
||||
|
||||
|
||||
private final String queryString;
|
||||
|
@ -159,6 +164,8 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
/** To limit effort spent determinizing regexp queries. */
|
||||
private int maxDeterminizedStates = DEFAULT_MAX_DETERMINED_STATES;
|
||||
|
||||
private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;
|
||||
|
||||
public QueryStringQueryBuilder(String queryString) {
|
||||
if (queryString == null) {
|
||||
throw new IllegalArgumentException("query text missing");
|
||||
|
@ -200,6 +207,11 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
timeZone = in.readOptionalTimeZone();
|
||||
escape = in.readBoolean();
|
||||
maxDeterminizedStates = in.readVInt();
|
||||
if (in.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) {
|
||||
splitOnWhitespace = in.readBoolean();
|
||||
} else {
|
||||
splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -234,6 +246,9 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
out.writeOptionalTimeZone(timeZone);
|
||||
out.writeBoolean(this.escape);
|
||||
out.writeVInt(this.maxDeterminizedStates);
|
||||
if (out.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) {
|
||||
out.writeBoolean(this.splitOnWhitespace);
|
||||
}
|
||||
}
|
||||
|
||||
public String queryString() {
|
||||
|
@ -570,6 +585,19 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
return this.escape;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether query text should be split on whitespace prior to analysis.
|
||||
* Default is <code>{@value #DEFAULT_SPLIT_ON_WHITESPACE}</code>.
|
||||
*/
|
||||
public QueryStringQueryBuilder splitOnWhitespace(boolean value) {
|
||||
this.splitOnWhitespace = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean splitOnWhitespace() {
|
||||
return splitOnWhitespace;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
builder.startObject(NAME);
|
||||
|
@ -626,6 +654,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
builder.field(TIME_ZONE_FIELD.getPreferredName(), this.timeZone.getID());
|
||||
}
|
||||
builder.field(ESCAPE_FIELD.getPreferredName(), this.escape);
|
||||
builder.field(SPLIT_ON_WHITESPACE.getPreferredName(), this.splitOnWhitespace);
|
||||
printBoostAndQueryName(builder);
|
||||
builder.endObject();
|
||||
}
|
||||
|
@ -661,6 +690,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
Fuzziness fuzziness = QueryStringQueryBuilder.DEFAULT_FUZZINESS;
|
||||
String fuzzyRewrite = null;
|
||||
String rewrite = null;
|
||||
boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;
|
||||
Map<String, Float> fieldsAndWeights = new HashMap<>();
|
||||
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
|
||||
if (token == XContentParser.Token.FIELD_NAME) {
|
||||
|
@ -750,6 +780,8 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
}
|
||||
} else if (parseContext.getParseFieldMatcher().match(currentFieldName, AbstractQueryBuilder.NAME_FIELD)) {
|
||||
queryName = parser.text();
|
||||
} else if (parseContext.getParseFieldMatcher().match(currentFieldName, SPLIT_ON_WHITESPACE)) {
|
||||
splitOnWhitespace = parser.booleanValue();
|
||||
} else {
|
||||
throw new ParsingException(parser.getTokenLocation(), "[" + QueryStringQueryBuilder.NAME +
|
||||
"] query does not support [" + currentFieldName + "]");
|
||||
|
@ -791,6 +823,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
queryStringQuery.locale(locale);
|
||||
queryStringQuery.boost(boost);
|
||||
queryStringQuery.queryName(queryName);
|
||||
queryStringQuery.splitOnWhitespace(splitOnWhitespace);
|
||||
return Optional.of(queryStringQuery);
|
||||
}
|
||||
|
||||
|
@ -827,7 +860,8 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
timeZone == null ? other.timeZone == null : other.timeZone != null &&
|
||||
Objects.equals(timeZone.getID(), other.timeZone.getID()) &&
|
||||
Objects.equals(escape, other.escape) &&
|
||||
Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates);
|
||||
Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates) &&
|
||||
Objects.equals(splitOnWhitespace, other.splitOnWhitespace);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -836,7 +870,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
quoteFieldSuffix, autoGeneratePhraseQueries, allowLeadingWildcard, lowercaseExpandedTerms,
|
||||
enablePositionIncrements, analyzeWildcard, locale.toLanguageTag(), fuzziness, fuzzyPrefixLength,
|
||||
fuzzyMaxExpansions, fuzzyRewrite, phraseSlop, useDisMax, tieBreaker, rewrite, minimumShouldMatch, lenient,
|
||||
timeZone == null ? 0 : timeZone.getID(), escape, maxDeterminizedStates);
|
||||
timeZone == null ? 0 : timeZone.getID(), escape, maxDeterminizedStates, splitOnWhitespace);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -904,6 +938,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
|
|||
qpSettings.lenient(lenient == null ? context.queryStringLenient() : lenient);
|
||||
qpSettings.timeZone(timeZone);
|
||||
qpSettings.maxDeterminizedStates(maxDeterminizedStates);
|
||||
qpSettings.splitOnWhitespace(splitOnWhitespace);
|
||||
|
||||
MapperQueryParser queryParser = context.queryParser(qpSettings);
|
||||
Query query;
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.elasticsearch.action.ShardValidateQueryRequestTests;
|
|||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.query.QueryStringQueryBuilder;
|
||||
import org.elasticsearch.monitor.os.OsStats;
|
||||
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
|
||||
import org.elasticsearch.search.internal.AliasFilter;
|
||||
|
@ -275,6 +276,7 @@ public class VersionTests extends ESTestCase {
|
|||
assertUnknownVersion(AliasFilter.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant
|
||||
assertUnknownVersion(OsStats.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant
|
||||
assertUnknownVersion(SimpleQueryStringBuilder.V_5_1_0_UNRELEASED);
|
||||
assertUnknownVersion(QueryStringQueryBuilder.V_5_1_0_UNRELEASED);
|
||||
// once we released 5.0.0 and it's added to Version.java we need to remove this constant
|
||||
}
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ import org.hamcrest.Matchers;
|
|||
import org.joda.time.DateTimeZone;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -151,6 +152,7 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase<QueryStr
|
|||
if (randomBoolean()) {
|
||||
queryStringQueryBuilder.timeZone(randomDateTimeZone().getID());
|
||||
}
|
||||
queryStringQueryBuilder.splitOnWhitespace(randomBoolean());
|
||||
return queryStringQueryBuilder;
|
||||
}
|
||||
|
||||
|
@ -532,6 +534,128 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase<QueryStr
|
|||
assertThat(phraseQuery.getTerms().length, equalTo(2));
|
||||
}
|
||||
|
||||
public void testToQuerySplitOnWhitespace() throws IOException {
|
||||
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
|
||||
// splitOnWhitespace=false
|
||||
{
|
||||
QueryStringQueryBuilder queryBuilder =
|
||||
new QueryStringQueryBuilder("foo bar")
|
||||
.field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
|
||||
.splitOnWhitespace(false);
|
||||
Query query = queryBuilder.toQuery(createShardContext());
|
||||
BooleanQuery bq1 =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD))
|
||||
.add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD))
|
||||
.build();
|
||||
List<Query> disjuncts = new ArrayList<>();
|
||||
disjuncts.add(bq1);
|
||||
disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar")));
|
||||
DisjunctionMaxQuery expectedQuery = new DisjunctionMaxQuery(disjuncts, 0.0f);
|
||||
assertThat(query, equalTo(expectedQuery));
|
||||
}
|
||||
|
||||
{
|
||||
QueryStringQueryBuilder queryBuilder =
|
||||
new QueryStringQueryBuilder("mapped_string:other foo bar")
|
||||
.field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
|
||||
.splitOnWhitespace(false);
|
||||
Query query = queryBuilder.toQuery(createShardContext());
|
||||
BooleanQuery bq1 =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD))
|
||||
.add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD))
|
||||
.build();
|
||||
List<Query> disjuncts = new ArrayList<>();
|
||||
disjuncts.add(bq1);
|
||||
disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar")));
|
||||
DisjunctionMaxQuery disjunctionMaxQuery = new DisjunctionMaxQuery(disjuncts, 0.0f);
|
||||
BooleanQuery expectedQuery =
|
||||
new BooleanQuery.Builder()
|
||||
.add(disjunctionMaxQuery, BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term(STRING_FIELD_NAME, "other")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
assertThat(query, equalTo(expectedQuery));
|
||||
}
|
||||
|
||||
{
|
||||
QueryStringQueryBuilder queryBuilder =
|
||||
new QueryStringQueryBuilder("foo OR bar")
|
||||
.field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
|
||||
.splitOnWhitespace(false);
|
||||
Query query = queryBuilder.toQuery(createShardContext());
|
||||
|
||||
List<Query> disjuncts1 = new ArrayList<>();
|
||||
disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo")));
|
||||
disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo")));
|
||||
DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f);
|
||||
|
||||
List<Query> disjuncts2 = new ArrayList<>();
|
||||
disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar")));
|
||||
disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar")));
|
||||
DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f);
|
||||
|
||||
BooleanQuery expectedQuery =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD))
|
||||
.add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD))
|
||||
.build();
|
||||
assertThat(query, equalTo(expectedQuery));
|
||||
}
|
||||
|
||||
// split_on_whitespace=false breaks range query with simple syntax
|
||||
{
|
||||
// throws an exception when lenient is set to false
|
||||
QueryStringQueryBuilder queryBuilder =
|
||||
new QueryStringQueryBuilder(">10 foo")
|
||||
.field(INT_FIELD_NAME)
|
||||
.splitOnWhitespace(false);
|
||||
IllegalArgumentException exc =
|
||||
expectThrows(IllegalArgumentException.class, () -> queryBuilder.toQuery(createShardContext()));
|
||||
assertThat(exc.getMessage(), equalTo("For input string: \"10 foo\""));
|
||||
}
|
||||
|
||||
{
|
||||
// returns an empty boolean query when lenient is set to true
|
||||
QueryStringQueryBuilder queryBuilder =
|
||||
new QueryStringQueryBuilder(">10 foo")
|
||||
.field(INT_FIELD_NAME)
|
||||
.splitOnWhitespace(false)
|
||||
.lenient(true);
|
||||
Query query = queryBuilder.toQuery(createShardContext());
|
||||
BooleanQuery bq = new BooleanQuery.Builder().build();
|
||||
assertThat(bq, equalTo(query));
|
||||
}
|
||||
|
||||
// splitOnWhitespace=true
|
||||
{
|
||||
QueryStringQueryBuilder queryBuilder =
|
||||
new QueryStringQueryBuilder("foo bar")
|
||||
.field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
|
||||
.splitOnWhitespace(true);
|
||||
Query query = queryBuilder.toQuery(createShardContext());
|
||||
|
||||
List<Query> disjuncts1 = new ArrayList<>();
|
||||
disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo")));
|
||||
disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo")));
|
||||
DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f);
|
||||
|
||||
List<Query> disjuncts2 = new ArrayList<>();
|
||||
disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar")));
|
||||
disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar")));
|
||||
DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f);
|
||||
|
||||
BooleanQuery expectedQuery =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD))
|
||||
.add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD))
|
||||
.build();
|
||||
assertThat(query, equalTo(expectedQuery));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
public void testFromJson() throws IOException {
|
||||
String json =
|
||||
"{\n" +
|
||||
|
@ -552,6 +676,7 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase<QueryStr
|
|||
" \"phrase_slop\" : 0,\n" +
|
||||
" \"locale\" : \"und\",\n" +
|
||||
" \"escape\" : false,\n" +
|
||||
" \"split_on_whitespace\" : true,\n" +
|
||||
" \"boost\" : 1.0\n" +
|
||||
" }\n" +
|
||||
"}";
|
||||
|
|
|
@ -90,6 +90,11 @@ http://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html[JODA timez
|
|||
the query string. This allows to use a field that has a different analysis chain
|
||||
for exact matching. Look <<mixing-exact-search-with-stemming,here>> for a
|
||||
comprehensive example.
|
||||
|
||||
|`split_on_whitespace` |Whether query text should be split on whitespace prior to analysis.
|
||||
When set to `false`, the query parser instead parses around only real 'operators'.
|
||||
Defaults to `true`.
|
||||
|
||||
|=======================================================================
|
||||
|
||||
When a multi term query is being generated, one can control how it gets
|
||||
|
|
|
@ -282,8 +282,8 @@ A space may also be a reserved character. For instance, if you have a
|
|||
synonym list which converts `"wi fi"` to `"wifi"`, a `query_string` search
|
||||
for `"wi fi"` would fail. The query string parser would interpret your
|
||||
query as a search for `"wi OR fi"`, while the token stored in your
|
||||
index is actually `"wifi"`. Escaping the space will protect it from
|
||||
being touched by the query string parser: `"wi\ fi"`.
|
||||
index is actually `"wifi"`. The option `split_on_whitespace=false` will protect it from
|
||||
being touched by the query string parser and will let the analysis run on the entire input (`"wi fi"`).
|
||||
****
|
||||
|
||||
===== Empty Query
|
||||
|
|
Loading…
Reference in New Issue