Match query with operator and, cutoff_frequency and stacked tokens

If the match query with cutoff_frequency encounters stacked tokens,
like synonyms in the same position, it returns a boolean query instead
of a common terms query.  However, if the original operator was set
to "and", it was ignoring that and resetting the operator to "or".

In fact, if the operator is "and", there is little benefit in using
a common terms query, because a query in which every term is required
(a "must" query) is already executed efficiently.
This commit is contained in:
Clinton Gormley 2014-06-20 12:42:43 +02:00
parent 534b07a3fb
commit 30c80319c0
3 changed files with 101 additions and 10 deletions

View File

@ -98,13 +98,6 @@ The `cutoff_frequency` can either be relative to the number of documents
in the index if in the range `[0..1)` or absolute if greater than or equal to in the index if in the range `[0..1)` or absolute if greater than or equal to
`1.0`. `1.0`.
Note: If the `cutoff_frequency` is used and the operator is `and`
_stacked tokens_ (tokens that are on the same position like `synonym` filter emits)
are not handled gracefully as they are in a pure `and` query. For instance the query
`fast fox` is analyzed into 3 terms `[fast, quick, fox]` where `quick` is a synonym
for `fast` on the same token positions the query might require `fast` and `quick` to
match if the operator is `and`.
Here is an example showing a query composed of stopwords exclusively: Here is an example showing a query composed of stopwords exclusively:
[source,js] [source,js]

View File

@ -279,7 +279,7 @@ public class MatchQuery {
} }
public Query createCommonTermsQuery(String field, String queryText, Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, FieldMapper<?> mapper) { public Query createCommonTermsQuery(String field, String queryText, Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, FieldMapper<?> mapper) {
Query booleanQuery = createBooleanQuery(field, queryText, Occur.SHOULD); Query booleanQuery = createBooleanQuery(field, queryText, lowFreqOccur);
if (booleanQuery != null && booleanQuery instanceof BooleanQuery) { if (booleanQuery != null && booleanQuery instanceof BooleanQuery) {
BooleanQuery bq = (BooleanQuery) booleanQuery; BooleanQuery bq = (BooleanQuery) booleanQuery;
ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, ((BooleanQuery)booleanQuery).isCoordDisabled(), mapper); ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, ((BooleanQuery)booleanQuery).isCoordDisabled(), mapper);

View File

@ -321,7 +321,7 @@ public class SimpleQueryTests extends ElasticsearchIntegrationTest {
searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("stop")).get(); searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("stop")).get();
assertHitCount(searchResponse, 3l); assertHitCount(searchResponse, 3l);
// standard drops "the" since its a stopword // stop drops "the" since its a stopword
assertFirstHit(searchResponse, hasId("1")); assertFirstHit(searchResponse, hasId("1"));
assertSecondHit(searchResponse, hasId("3")); assertSecondHit(searchResponse, hasId("3"));
assertThirdHit(searchResponse, hasId("2")); assertThirdHit(searchResponse, hasId("2"));
@ -340,7 +340,7 @@ public class SimpleQueryTests extends ElasticsearchIntegrationTest {
searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).get(); searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).get();
assertHitCount(searchResponse, 3l); assertHitCount(searchResponse, 3l);
// standard drops "the" since its a stopword // stop drops "the" since its a stopword
assertFirstHit(searchResponse, hasId("1")); assertFirstHit(searchResponse, hasId("1"));
assertSecondHit(searchResponse, hasId("3")); assertSecondHit(searchResponse, hasId("3"));
assertThirdHit(searchResponse, hasId("2")); assertThirdHit(searchResponse, hasId("2"));
@ -353,6 +353,104 @@ public class SimpleQueryTests extends ElasticsearchIntegrationTest {
assertThirdHit(searchResponse, hasId("2")); assertThirdHit(searchResponse, hasId("2"));
} }
/**
 * Exercises common terms, match and multi match queries with a
 * {@code cutoff_frequency} against fields whose analyzer emits stacked tokens
 * (synonyms placed on the same position).
 *
 * The index uses a custom {@code syns} analyzer (whitespace tokenizer plus a
 * synonym filter declaring {@code quick,fast}), so a query for "fast" is
 * expected to match documents that were indexed with "quick".
 */
@Test
public void testCommonTermsQueryStackedTokens() throws Exception {
    // Single shard, presumably so that term statistics (and therefore the
    // frequency cutoff and scoring) are the same for every document.
    assertAcked(prepareCreate("test")
            .setSettings(settingsBuilder()
                    .put(indexSettings())
                    .put(SETTING_NUMBER_OF_SHARDS,1)
                    .put("index.analysis.filter.syns.type","synonym")
                    .putArray("index.analysis.filter.syns.synonyms","quick,fast")
                    .put("index.analysis.analyzer.syns.tokenizer","whitespace")
                    .put("index.analysis.analyzer.syns.filter","syns")
            )
            .addMapping("type1", "field1", "type=string,analyzer=syns", "field2", "type=string,analyzer=syns"));
    ensureGreen();

    // Doc 3 is the only one with field2, and its field1 contains neither "the" nor "fox".
    indexRandom(true, client().prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown pidgin", "field2", "the quick lazy huge brown fox jumps over the tree"),
            client().prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox"),
            client().prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree") );

    // ---- common terms query ----

    // Low-frequency operator OR: every document matches.
    SearchResponse searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast brown").cutoffFrequency(3).lowFreqOperator(Operator.OR)).get();
    assertHitCount(searchResponse, 3l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));
    assertThirdHit(searchResponse, hasId("3"));

    // Low-frequency operator AND: doc 3 is expected to drop out (its field1 has no "the").
    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast brown").cutoffFrequency(3).lowFreqOperator(Operator.AND)).get();
    // Use assertHitCount like every other hit-count check in this test.
    assertHitCount(searchResponse, 2l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));

    // Default low-frequency operator: same expectations as the explicit OR case above.
    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast brown").cutoffFrequency(3)).get();
    assertHitCount(searchResponse, 3l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));
    assertThirdHit(searchResponse, hasId("3"));

    // minimum_should_match on the low-frequency terms.
    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast huge fox").lowFreqMinimumShouldMatch("3")).get();
    assertHitCount(searchResponse, 1l);
    assertFirstHit(searchResponse, hasId("2"));

    // minimum_should_match on the high-frequency terms, with an absolute cutoff of 1.
    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("5")).get();
    assertHitCount(searchResponse, 2l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));

    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast lazy fox brown").cutoffFrequency(1).highFreqMinimumShouldMatch("6")).get();
    assertHitCount(searchResponse, 1l);
    assertFirstHit(searchResponse, hasId("2"));

    // Same query as the previous one, expressed as raw JSON instead of through the builder.
    searchResponse = client().prepareSearch().setQuery("{ \"common\" : { \"field1\" : { \"query\" : \"the fast lazy fox brown\", \"cutoff_frequency\" : 1, \"minimum_should_match\" : { \"high_freq\" : 6 } } } }").get();
    assertHitCount(searchResponse, 1l);
    assertFirstHit(searchResponse, hasId("2"));

    // Default: no explicit minimum_should_match.
    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the fast lazy fox brown").cutoffFrequency(1)).get();
    assertHitCount(searchResponse, 1l);
    assertFirstHit(searchResponse, hasId("2"));

    searchResponse = client().prepareSearch().setQuery(commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("stop")).get();
    assertHitCount(searchResponse, 3l);
    // stop drops "the" since it's a stopword
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("3"));
    assertThirdHit(searchResponse, hasId("2"));

    // ---- try the same with match query ----

    // Operator AND together with cutoff_frequency and stacked tokens: the
    // operator must be honoured, so only the documents matching all terms remain.
    searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).get();
    assertHitCount(searchResponse, 2l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));

    searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.OR)).get();
    assertHitCount(searchResponse, 3l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));
    assertThirdHit(searchResponse, hasId("3"));

    searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).get();
    assertHitCount(searchResponse, 3l);
    // stop drops "the" since it's a stopword
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("3"));
    assertThirdHit(searchResponse, hasId("2"));

    searchResponse = client().prepareSearch().setQuery(matchQuery("field1", "the fast brown").cutoffFrequency(3).minimumShouldMatch("3")).get();
    assertHitCount(searchResponse, 2l);
    assertFirstHit(searchResponse, hasId("1"));
    assertSecondHit(searchResponse, hasId("2"));

    // ---- try the same with multi match query ----

    // Doc 3 can now also match through field2, which does contain "the".
    searchResponse = client().prepareSearch().setQuery(multiMatchQuery("the fast brown", "field1", "field2").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).get();
    assertHitCount(searchResponse, 3l);
    assertFirstHit(searchResponse, hasId("3")); // better score due to different query stats
    assertSecondHit(searchResponse, hasId("1"));
    assertThirdHit(searchResponse, hasId("2"));
}
@Test @Test
public void testOmitTermFreqsAndPositions() throws Exception { public void testOmitTermFreqsAndPositions() throws Exception {
Version version = Version.CURRENT; Version version = Version.CURRENT;