percolator: Add support to extract terms from several types of span queries

This commit is contained in:
Martijn van Groningen 2016-03-24 15:02:42 +01:00
parent 26a0fb37a4
commit 7600dc9943
4 changed files with 163 additions and 2 deletions

View File

@ -37,6 +37,16 @@ import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanContainingQuery;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWithinQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.logging.LoggerMessageFormat;
@ -95,7 +105,6 @@ public final class ExtractQueryTermsService {
* an UnsupportedQueryException is thrown.
*/
static Set<Term> extractQueryTerms(Query query) {
// TODO: add support for span queries
if (query instanceof TermQuery) {
return Collections.singleton(((TermQuery) query).getTerm());
} else if (query instanceof TermsQuery) {
@ -170,6 +179,27 @@ public final class ExtractQueryTermsService {
} else if (query instanceof BlendedTermQuery) {
List<Term> terms = ((BlendedTermQuery) query).getTerms();
return new HashSet<>(terms);
} else if (query instanceof SpanTermQuery) {
return Collections.singleton(((SpanTermQuery) query).getTerm());
} else if (query instanceof SpanNearQuery) {
Set<Term> bestClause = null;
SpanNearQuery spanNearQuery = (SpanNearQuery) query;
for (SpanQuery clause : spanNearQuery.getClauses()) {
Set<Term> temp = extractQueryTerms(clause);
bestClause = selectTermListWithTheLongestShortestTerm(temp, bestClause);
}
return bestClause;
} else if (query instanceof SpanOrQuery) {
Set<Term> terms = new HashSet<>();
SpanOrQuery spanOrQuery = (SpanOrQuery) query;
for (SpanQuery clause : spanOrQuery.getClauses()) {
terms.addAll(extractQueryTerms(clause));
}
return terms;
} else if (query instanceof SpanFirstQuery) {
return extractQueryTerms(((SpanFirstQuery)query).getMatch());
} else if (query instanceof SpanNotQuery) {
return extractQueryTerms(((SpanNotQuery) query).getInclude());
} else {
throw new UnsupportedQueryException(query);
}

View File

@ -35,12 +35,19 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.test.ESTestCase;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
@ -242,6 +249,56 @@ public class ExtractQueryTermsServiceTests extends ESTestCase {
assertThat(result.get(1).text(), equalTo("_term2"));
}
// Verifies that extracting terms from a single SpanTermQuery yields exactly the term it wraps.
public void testExtractQueryMetadata_spanTermQuery() {
    // Span queries that aren't exposed in the query dsl and are therefore not supported:
    // 1) SpanPositionRangeQuery
    // 2) PayloadScoreQuery
    // 3) SpanBoostQuery
    //
    // Span queries that can't be supported because of how these queries work:
    // 1) SpanMultiTermQueryWrapper: not supported, because there is no support for MTQ typed queries yet.
    // 2) SpanContainingQuery: it is a kind of range of spans and we don't know what is between the
    //    little and big terms.
    // 3) SpanWithinQuery: same reason as SpanContainingQuery.
    // 4) FieldMaskingSpanQuery: a tricky query, so we shouldn't optimize it.
    Term shortTerm = new Term("_field", "_short_term");
    SpanTermQuery query = new SpanTermQuery(shortTerm);
    Set<Term> extracted = ExtractQueryTermsService.extractQueryTerms(query);
    assertTermsEqual(extracted, query.getTerm());
}
// Verifies that for an in-order SpanNearQuery over two term clauses only the term of the
// second clause ("_very_long_term") is extracted.
public void testExtractQueryMetadata_spanNearQuery() {
    SpanTermQuery shortClause = new SpanTermQuery(new Term("_field", "_short_term"));
    SpanTermQuery longClause = new SpanTermQuery(new Term("_field", "_very_long_term"));

    // Build the near query step by step instead of via a fluent chain.
    SpanNearQuery.Builder nearBuilder = new SpanNearQuery.Builder("_field", true);
    nearBuilder.addClause(shortClause);
    nearBuilder.addClause(longClause);
    SpanNearQuery nearQuery = nearBuilder.build();

    Set<Term> extracted = ExtractQueryTermsService.extractQueryTerms(nearQuery);
    assertTermsEqual(extracted, longClause.getTerm());
}
// Verifies that for a SpanOrQuery the terms of all clauses are extracted.
public void testExtractQueryMetadata_spanOrQuery() {
    SpanTermQuery shortClause = new SpanTermQuery(new Term("_field", "_short_term"));
    SpanTermQuery longClause = new SpanTermQuery(new Term("_field", "_very_long_term"));

    Set<Term> extracted = ExtractQueryTermsService.extractQueryTerms(new SpanOrQuery(shortClause, longClause));

    assertTermsEqual(extracted, shortClause.getTerm(), longClause.getTerm());
}
// Verifies that for a SpanFirstQuery the terms of the wrapped match query are extracted.
public void testExtractQueryMetadata_spanFirstQuery() {
    // Only one clause is needed here; the second, never-used SpanTermQuery local that the
    // sibling tests declare was a copy-paste leftover and has been dropped.
    SpanTermQuery spanTermQuery1 = new SpanTermQuery(new Term("_field", "_short_term"));
    SpanFirstQuery spanFirstQuery = new SpanFirstQuery(spanTermQuery1, 20);
    Set<Term> terms = ExtractQueryTermsService.extractQueryTerms(spanFirstQuery);
    assertTermsEqual(terms, spanTermQuery1.getTerm());
}
// Verifies that for a SpanNotQuery only the terms of the include clause are extracted;
// the term of the exclude clause must not show up in the result.
public void testExtractQueryMetadata_spanNotQuery() {
    SpanTermQuery includeClause = new SpanTermQuery(new Term("_field", "_short_term"));
    SpanTermQuery excludeClause = new SpanTermQuery(new Term("_field", "_very_long_term"));

    SpanNotQuery notQuery = new SpanNotQuery(includeClause, excludeClause);
    Set<Term> extracted = ExtractQueryTermsService.extractQueryTerms(notQuery);

    assertTermsEqual(extracted, includeClause.getTerm());
}
public void testExtractQueryMetadata_unsupportedQuery() {
TermRangeQuery termRangeQuery = new TermRangeQuery("_field", null, null, true, false);
@ -330,4 +387,8 @@ public class ExtractQueryTermsServiceTests extends ESTestCase {
assertThat(((TermQuery) booleanQuery.clauses().get(i).getQuery()).getTerm().bytes().utf8ToString(), equalTo(expectedValue));
}
// Asserts that the extracted terms are exactly the expected terms, ignoring order.
private static void assertTermsEqual(Set<Term> actual, Term... expected) {
    Set<Term> expectedTerms = new HashSet<>(Arrays.asList(expected));
    assertEquals(expectedTerms, actual);
}
}

View File

@ -45,6 +45,10 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.index.mapper.ParseContext;
@ -194,6 +198,8 @@ public class PercolatorQueryTests extends ESTestCase {
query = new WildcardQuery(new Term("field", id + "*"));
} else if (randomBoolean()) {
query = new CustomQuery(new Term("field", id + "*"));
} else if (randomBoolean()) {
query = new SpanTermQuery(new Term("field", id));
} else {
query = new TermQuery(new Term("field", id));
}
@ -223,6 +229,27 @@ public class PercolatorQueryTests extends ESTestCase {
new Term("field", "brown"), new Term("field", "fox")}, false);
addPercolatorQuery("_id2", blendedTermQuery);
SpanNearQuery spanNearQuery = new SpanNearQuery.Builder("field", true)
.addClause(new SpanTermQuery(new Term("field", "quick")))
.addClause(new SpanTermQuery(new Term("field", "brown")))
.addClause(new SpanTermQuery(new Term("field", "fox")))
.build();
addPercolatorQuery("_id3", spanNearQuery);
SpanNearQuery spanNearQuery2 = new SpanNearQuery.Builder("field", true)
.addClause(new SpanTermQuery(new Term("field", "the")))
.addClause(new SpanTermQuery(new Term("field", "lazy")))
.addClause(new SpanTermQuery(new Term("field", "doc")))
.build();
SpanOrQuery spanOrQuery = new SpanOrQuery(
spanNearQuery,
spanNearQuery2
);
addPercolatorQuery("_id4", spanOrQuery);
SpanNotQuery spanNotQuery = new SpanNotQuery(spanNearQuery, spanNearQuery);
addPercolatorQuery("_id5", spanNotQuery);
indexWriter.close();
directoryReader = DirectoryReader.open(directory);
IndexSearcher shardSearcher = newSearcher(directoryReader);

View File

@ -33,6 +33,9 @@ import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.index.query.QueryBuilders.percolatorQuery;
import static org.elasticsearch.index.query.QueryBuilders.spanNearQuery;
import static org.elasticsearch.index.query.QueryBuilders.spanNotQuery;
import static org.elasticsearch.index.query.QueryBuilders.spanTermQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.hamcrest.Matchers.equalTo;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
@ -99,6 +102,44 @@ public class PercolatorQuerySearchIT extends ESSingleNodeTestCase {
.setSource(jsonBuilder().startObject().field("query", multiMatchQuery("quick brown fox", "field1", "field2")
.type(MultiMatchQueryBuilder.Type.CROSS_FIELDS)).endObject())
.get();
client().prepareIndex("test", PercolatorFieldMapper.TYPE_NAME, "3")
.setSource(jsonBuilder().startObject().field("query",
spanNearQuery(spanTermQuery("field1", "quick"), 0)
.clause(spanTermQuery("field1", "brown"))
.clause(spanTermQuery("field1", "fox"))
.inOrder(true)
).endObject())
.get();
client().admin().indices().prepareRefresh().get();
client().prepareIndex("test", PercolatorFieldMapper.TYPE_NAME, "4")
.setSource(jsonBuilder().startObject().field("query",
spanNotQuery(
spanNearQuery(spanTermQuery("field1", "quick"), 0)
.clause(spanTermQuery("field1", "brown"))
.clause(spanTermQuery("field1", "fox"))
.inOrder(true),
spanNearQuery(spanTermQuery("field1", "the"), 0)
.clause(spanTermQuery("field1", "lazy"))
.clause(spanTermQuery("field1", "dog"))
.inOrder(true)).dist(2)
).endObject())
.get();
// doesn't match
client().prepareIndex("test", PercolatorFieldMapper.TYPE_NAME, "5")
.setSource(jsonBuilder().startObject().field("query",
spanNotQuery(
spanNearQuery(spanTermQuery("field1", "quick"), 0)
.clause(spanTermQuery("field1", "brown"))
.clause(spanTermQuery("field1", "fox"))
.inOrder(true),
spanNearQuery(spanTermQuery("field1", "the"), 0)
.clause(spanTermQuery("field1", "lazy"))
.clause(spanTermQuery("field1", "dog"))
.inOrder(true)).dist(3)
).endObject())
.get();
client().admin().indices().prepareRefresh().get();
BytesReference source = jsonBuilder().startObject()
@ -108,9 +149,11 @@ public class PercolatorQuerySearchIT extends ESSingleNodeTestCase {
SearchResponse response = client().prepareSearch()
.setQuery(percolatorQuery("type", source))
.get();
assertHitCount(response, 2);
assertHitCount(response, 4);
assertThat(response.getHits().getAt(0).getId(), equalTo("1"));
assertThat(response.getHits().getAt(1).getId(), equalTo("2"));
assertThat(response.getHits().getAt(2).getId(), equalTo("3"));
assertThat(response.getHits().getAt(3).getId(), equalTo("4"));
}
public void testPercolatorQueryWithHighlighting() throws Exception {