From 9d6fac809c2f9daf94d9c20a8a206f03e6cbac58 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 2 Nov 2016 10:00:40 +0100 Subject: [PATCH] Expose splitOnWhitespace in `Query String Query` (#20965) This change adds an option called `split_on_whitespace` which prevents the query parser to split free text part on whitespace prior to analysis. Instead the queryparser would parse around only real 'operators'. Default to true. For instance the query `"foo bar"` would let the analyzer of the targeted field decide how the tokens should be splitted. Some options are missing in this change but I'd like to add them in a follow up PR in order to be able to simplify the backport in 5.x. The missing options (changes) are: * A `type` option which similarly to the `multi_match` query defines how the free text should be parsed when multi fields are defined. * Simple range query with additional tokens like ">100 50" are broken when `split_on_whitespace` is set to false. It should be possible to preserve this syntax and make the parser aware of this special syntax even when `split_on_whitespace` is set to false. * Since all this options would make the `query_string_query` very similar to a match (multi_match) query we should be able to share the code that produce the final Lucene query. --- .../classic/MapperQueryParser.java | 1 + .../classic/QueryParserSettings.java | 10 ++ .../index/query/QueryStringQueryBuilder.java | 39 +++++- .../java/org/elasticsearch/VersionTests.java | 2 + .../query/QueryStringQueryBuilderTests.java | 125 ++++++++++++++++++ .../query-dsl/query-string-query.asciidoc | 5 + .../query-dsl/query-string-syntax.asciidoc | 4 +- 7 files changed, 182 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java index b6f0020fb5b..bbb27aa11db 100644 --- a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java +++ b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java @@ -104,6 +104,7 @@ public class MapperQueryParser extends QueryParser { setDefaultOperator(settings.defaultOperator()); setFuzzyPrefixLength(settings.fuzzyPrefixLength()); setLocale(settings.locale()); + setSplitOnWhitespace(settings.splitOnWhitespace()); } /** diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java b/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java index c1fc2ae556e..a03c84d8980 100644 --- a/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java +++ b/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java @@ -79,6 +79,8 @@ public class QueryParserSettings { /** To limit effort spent determinizing regexp queries. */ private int maxDeterminizedStates; + private boolean splitOnWhitespace; + public QueryParserSettings(String queryString) { this.queryString = queryString; } @@ -290,4 +292,12 @@ public class QueryParserSettings { public Fuzziness fuzziness() { return fuzziness; } + + public void splitOnWhitespace(boolean value) { + this.splitOnWhitespace = value; + } + + public boolean splitOnWhitespace() { + return splitOnWhitespace; + } } diff --git a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java index 807343237d2..d88484c98a6 100644 --- a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java +++ b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java @@ -26,6 +26,7 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.io.stream.StreamInput; @@ -59,6 +60,8 @@ import java.util.TreeMap; public class QueryStringQueryBuilder extends AbstractQueryBuilder { public static final String NAME = "query_string"; + public static final Version V_5_1_0_UNRELEASED = Version.fromId(5010099); + public static final boolean DEFAULT_AUTO_GENERATE_PHRASE_QUERIES = false; public static final int DEFAULT_MAX_DETERMINED_STATES = Operations.DEFAULT_MAX_DETERMINIZED_STATES; public static final boolean DEFAULT_LOWERCASE_EXPANDED_TERMS = true; @@ -72,6 +75,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder{@value #DEFAULT_SPLIT_ON_WHITESPACE}. + */ + public QueryStringQueryBuilder splitOnWhitespace(boolean value) { + this.splitOnWhitespace = value; + return this; + } + + public boolean splitOnWhitespace() { + return splitOnWhitespace; + } + @Override protected void doXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(NAME); @@ -626,6 +654,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder fieldsAndWeights = new HashMap<>(); while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { @@ -750,6 +780,8 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder 0); + // splitOnWhitespace=false + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("foo bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(false); + Query query = queryBuilder.toQuery(createShardContext()); + BooleanQuery bq1 = + new BooleanQuery.Builder() + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD)) + .build(); + List disjuncts = new ArrayList<>(); + disjuncts.add(bq1); + disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar"))); + DisjunctionMaxQuery expectedQuery = new DisjunctionMaxQuery(disjuncts, 0.0f); + assertThat(query, equalTo(expectedQuery)); + } + + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("mapped_string:other foo bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(false); + Query query = queryBuilder.toQuery(createShardContext()); + BooleanQuery bq1 = + new BooleanQuery.Builder() + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD)) + .build(); + List disjuncts = new ArrayList<>(); + disjuncts.add(bq1); + disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar"))); + DisjunctionMaxQuery disjunctionMaxQuery = new DisjunctionMaxQuery(disjuncts, 0.0f); + BooleanQuery expectedQuery = + new BooleanQuery.Builder() + .add(disjunctionMaxQuery, BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term(STRING_FIELD_NAME, "other")), BooleanClause.Occur.SHOULD) + .build(); + assertThat(query, equalTo(expectedQuery)); + } + + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("foo OR bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(false); + Query query = queryBuilder.toQuery(createShardContext()); + + List disjuncts1 = new ArrayList<>(); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo"))); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo"))); + DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f); + + List disjuncts2 = new ArrayList<>(); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar"))); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar"))); + DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f); + + BooleanQuery expectedQuery = + new BooleanQuery.Builder() + .add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD)) + .build(); + assertThat(query, equalTo(expectedQuery)); + } + + // split_on_whitespace=false breaks range query with simple syntax + { + // throws an exception when lenient is set to false + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder(">10 foo") + .field(INT_FIELD_NAME) + .splitOnWhitespace(false); + IllegalArgumentException exc = + expectThrows(IllegalArgumentException.class, () -> queryBuilder.toQuery(createShardContext())); + assertThat(exc.getMessage(), equalTo("For input string: \"10 foo\"")); + } + + { + // returns an empty boolean query when lenient is set to true + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder(">10 foo") + .field(INT_FIELD_NAME) + .splitOnWhitespace(false) + .lenient(true); + Query query = queryBuilder.toQuery(createShardContext()); + BooleanQuery bq = new BooleanQuery.Builder().build(); + assertThat(bq, equalTo(query)); + } + + // splitOnWhitespace=true + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("foo bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(true); + Query query = queryBuilder.toQuery(createShardContext()); + + List disjuncts1 = new ArrayList<>(); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo"))); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo"))); + DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f); + + List disjuncts2 = new ArrayList<>(); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar"))); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar"))); + DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f); + + BooleanQuery expectedQuery = + new BooleanQuery.Builder() + .add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD)) + .build(); + assertThat(query, equalTo(expectedQuery)); + } + + + } + public void testFromJson() throws IOException { String json = "{\n" + @@ -552,6 +676,7 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase> for a comprehensive example. + +|`split_on_whitespace` |Whether query text should be split on whitespace prior to analysis. + Instead the queryparser would parse around only real 'operators'. + Default to `false`. + |======================================================================= When a multi term query is being generated, one can control how it gets diff --git a/docs/reference/query-dsl/query-string-syntax.asciidoc b/docs/reference/query-dsl/query-string-syntax.asciidoc index 9e847102469..49a8b54c506 100644 --- a/docs/reference/query-dsl/query-string-syntax.asciidoc +++ b/docs/reference/query-dsl/query-string-syntax.asciidoc @@ -282,8 +282,8 @@ A space may also be a reserved character. For instance, if you have a synonym list which converts `"wi fi"` to `"wifi"`, a `query_string` search for `"wi fi"` would fail. The query string parser would interpret your query as a search for `"wi OR fi"`, while the token stored in your -index is actually `"wifi"`. Escaping the space will protect it from -being touched by the query string parser: `"wi\ fi"`. +index is actually `"wifi"`. The option `split_on_whitespace=false` will protect it from +being touched by the query string parser and will let the analysis run on the entire input (`"wi fi"`). **** ===== Empty Query