From 30c80319c05362fae49fdfe5d6422c59f942f0db Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Fri, 20 Jun 2014 12:42:43 +0200 Subject: [PATCH] Match query with operator and, cutoff_frequency and stacked tokens If the match query with cutoff_frequency encounters stacked tokens, like synonyms in the same position, it returns a boolean query instead of a common terms query. However, if the original operator was set to "and", it was ignoring that and resetting the operator to "or". In fact, if operator is "and" then there is little benefit in using a common terms query as a must query is already executed efficiently. --- .../query-dsl/queries/match-query.asciidoc | 7 -- .../index/search/MatchQuery.java | 2 +- .../search/query/SimpleQueryTests.java | 102 +++++++++++++++++- 3 files changed, 101 insertions(+), 10 deletions(-) diff --git a/docs/reference/query-dsl/queries/match-query.asciidoc b/docs/reference/query-dsl/queries/match-query.asciidoc index 350d078361f..16fd1afc43a 100644 --- a/docs/reference/query-dsl/queries/match-query.asciidoc +++ b/docs/reference/query-dsl/queries/match-query.asciidoc @@ -98,13 +98,6 @@ The `cutoff_frequency` can either be relative to the number of documents in the index if in the range `[0..1)` or absolute if greater or equal to `1.0`. -Note: If the `cutoff_frequency` is used and the operator is `and` -_stacked tokens_ (tokens that are on the same position like `synonym` filter emits) -are not handled gracefully as they are in a pure `and` query. For instance the query -`fast fox` is analyzed into 3 terms `[fast, quick, fox]` where `quick` is a synonym -for `fast` on the same token positions the query might require `fast` and `quick` to -match if the operator is `and`. - Here is an example showing a query composed of stopwords exclusivly: [source,js] diff --git a/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/src/main/java/org/elasticsearch/index/search/MatchQuery.java index c9ab99e3a80..b7764161027 100644 --- a/src/main/java/org/elasticsearch/index/search/MatchQuery.java +++ b/src/main/java/org/elasticsearch/index/search/MatchQuery.java @@ -279,7 +279,7 @@ public class MatchQuery { } public Query createCommonTermsQuery(String field, String queryText, Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, FieldMapper mapper) { - Query booleanQuery = createBooleanQuery(field, queryText, Occur.SHOULD); + Query booleanQuery = createBooleanQuery(field, queryText, lowFreqOccur); if (booleanQuery != null && booleanQuery instanceof BooleanQuery) { BooleanQuery bq = (BooleanQuery) booleanQuery; ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, ((BooleanQuery)booleanQuery).isCoordDisabled(), mapper); diff --git a/src/test/java/org/elasticsearch/search/query/SimpleQueryTests.java b/src/test/java/org/elasticsearch/search/query/SimpleQueryTests.java index 002ebeca180..124cdd902fe 100644 --- a/src/test/java/org/elasticsearch/search/query/SimpleQueryTests.java +++ b/src/test/java/org/elasticsearch/search/query/SimpleQueryTests.java @@ -321,7 +321,7 @@ public class SimpleQueryTests extends ElasticsearchIntegrationTest { searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("stop")).get(); assertHitCount(searchResponse, 3l); - // standard drops "the" since its a stopword + // stop drops "the" since its a stopword assertFirstHit(searchResponse, hasId("1")); assertSecondHit(searchResponse, hasId("3")); assertThirdHit(searchResponse, hasId("2")); @@ -340,7 +340,7 @@ public class SimpleQueryTests extends ElasticsearchIntegrationTest { searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).get(); assertHitCount(searchResponse, 3l); - // standard drops "the" since its a stopword + // stop drops "the" since its a stopword assertFirstHit(searchResponse, hasId("1")); assertSecondHit(searchResponse, hasId("3")); assertThirdHit(searchResponse, hasId("2")); @@ -353,6 +353,104 @@ public class SimpleQueryTests extends ElasticsearchIntegrationTest { assertThirdHit(searchResponse, hasId("2")); } + @Test + public void testCommonTermsQueryStackedTokens() throws Exception { + assertAcked(prepareCreate("test") + .setSettings(settingsBuilder() + .put(indexSettings()) + .put(SETTING_NUMBER_OF_SHARDS,1) + .put("index.analysis.filter.syns.type","synonym") + .putArray("index.analysis.filter.syns.synonyms","quick,fast") + .put("index.analysis.analyzer.syns.tokenizer","whitespace") + .put("index.analysis.analyzer.syns.filter","syns") + ) + .addMapping("type1", "field1", "type=string,analyzer=syns", "field2", "type=string,analyzer=syns")); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown pidgin", "field2", "the quick lazy huge brown fox jumps over the tree"), + client().prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox"), + client().prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree") ); + + SearchResponse searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast brown").cutoffFrequency(3).lowFreqOperator(Operator.OR)).get(); + assertHitCount(searchResponse, 3l); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + assertThirdHit(searchResponse, hasId("3")); + + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast brown").cutoffFrequency(3).lowFreqOperator(Operator.AND)).get(); + assertThat(searchResponse.getHits().totalHits(), equalTo(2l)); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + + // Default + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast brown").cutoffFrequency(3)).get(); + assertHitCount(searchResponse, 3l); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + assertThirdHit(searchResponse, hasId("3")); + + + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast huge fox").lowFreqMinimumShouldMatch("3")).get(); + assertHitCount(searchResponse, 1l); + assertFirstHit(searchResponse, hasId("2")); + + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("5")).get(); + assertHitCount(searchResponse, 2l); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("6")).get(); + assertHitCount(searchResponse, 1l); + assertFirstHit(searchResponse, hasId("2")); + + searchResponse = client().prepareSearch().setQuery("{ \"common\" : { \"field1\" : { \"query\" : \"the fast lazy fox brown\", \"cutoff_frequency\" : 1, \"minimum_should_match\" : { \"high_freq\" : 6 } } } }").get(); + assertHitCount(searchResponse, 1l); + assertFirstHit(searchResponse, hasId("2")); + + // Default + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast lazy fox brown").cutoffFrequency(1)).get(); + assertHitCount(searchResponse, 1l); + assertFirstHit(searchResponse, hasId("2")); + + searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("stop")).get(); + assertHitCount(searchResponse, 3l); + // stop drops "the" since its a stopword + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("3")); + assertThirdHit(searchResponse, hasId("2")); + + // try the same with match query + searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).get(); + assertHitCount(searchResponse, 2l); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + + searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.OR)).get(); + assertHitCount(searchResponse, 3l); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + assertThirdHit(searchResponse, hasId("3")); + + searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).get(); + assertHitCount(searchResponse, 3l); + // stop drops "the" since its a stopword + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("3")); + assertThirdHit(searchResponse, hasId("2")); + + searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).minimumShouldMatch("3")).get(); + assertHitCount(searchResponse, 2l); + assertFirstHit(searchResponse, hasId("1")); + assertSecondHit(searchResponse, hasId("2")); + + // try the same with multi match query + searchResponse = client().prepareSearch().setQuery(multiMatchQuery("the fast brown", "field1", "field2").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).get(); + assertHitCount(searchResponse, 3l); + assertFirstHit(searchResponse, hasId("3")); // better score due to different query stats + assertSecondHit(searchResponse, hasId("1")); + assertThirdHit(searchResponse, hasId("2")); + } + @Test public void testOmitTermFreqsAndPositions() throws Exception { Version version = Version.CURRENT;