mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-26 14:54:56 +00:00
parent
c4eab90b2e
commit
4eefcb9c82
@ -143,7 +143,7 @@ public class Queries {
|
||||
private static Pattern spacePattern = Pattern.compile(" ");
|
||||
private static Pattern lessThanPattern = Pattern.compile("<");
|
||||
|
||||
static int calculateMinShouldMatch(int optionalClauseCount, String spec) {
|
||||
public static int calculateMinShouldMatch(int optionalClauseCount, String spec) {
|
||||
int result = optionalClauseCount;
|
||||
spec = spec.trim();
|
||||
|
||||
|
@ -0,0 +1,169 @@
|
||||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.query;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
|
||||
/**
|
||||
* CommonTermsQuery query is a query that executes high-frequency terms in a
|
||||
* optional sub-query to prevent slow queries due to "common" terms like
|
||||
* stopwords. This query basically builds 2 queries off the {@link #add(Term)
|
||||
* added} terms where low-frequency terms are added to a required boolean clause
|
||||
* and high-frequency terms are added to an optional boolean clause. The
|
||||
* optional clause is only executed if the required "low-frequency' clause
|
||||
* matches. Scores produced by this query will be slightly different to plain
|
||||
* {@link BooleanQuery} scorer mainly due to differences in the
|
||||
* {@link Similarity#coord(int,int) number of leave queries} in the required
|
||||
* boolean clause. In the most cases high-frequency terms are unlikely to
|
||||
* significantly contribute to the document score unless at least one of the
|
||||
* low-frequency terms are matched such that this query can improve query
|
||||
* execution times significantly if applicable.
|
||||
* <p>
|
||||
*/
|
||||
public class CommonTermsQueryBuilder extends BaseQueryBuilder implements BoostableQueryBuilder<CommonTermsQueryBuilder> {
|
||||
|
||||
public static enum Operator {
|
||||
OR, AND
|
||||
}
|
||||
|
||||
private final String name;
|
||||
|
||||
private final Object text;
|
||||
|
||||
private Operator highFreqOperator = null;
|
||||
|
||||
private Operator lowFreqOperator = null;
|
||||
|
||||
private String analyzer = null;
|
||||
|
||||
private Float boost = null;
|
||||
|
||||
private String minimumShouldMatch = null;
|
||||
|
||||
private Boolean disableCoords = null;
|
||||
|
||||
private Float cutoffFrequency = null;
|
||||
|
||||
/**
|
||||
* Constructs a new common terms query.
|
||||
*/
|
||||
public CommonTermsQueryBuilder(String name, Object text) {
|
||||
if (name == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("Field name must not be null");
|
||||
}
|
||||
if (text == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("Query must not be null");
|
||||
}
|
||||
this.text = text;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the operator to use for terms with a high document frequency
|
||||
* (greater than or equal to {@link #cutoffFrequency(float)}. Defaults to
|
||||
* <tt>AND</tt>.
|
||||
*/
|
||||
public CommonTermsQueryBuilder highFreqOperator(Operator operator) {
|
||||
this.highFreqOperator = operator;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the operator to use for terms with a low document frequency (less
|
||||
* than {@link #cutoffFrequency(float)}. Defaults to <tt>AND</tt>.
|
||||
*/
|
||||
public CommonTermsQueryBuilder lowFreqOperator(Operator operator) {
|
||||
this.lowFreqOperator = operator;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Explicitly set the analyzer to use. Defaults to use explicit mapping
|
||||
* config for the field, or, if not set, the default search analyzer.
|
||||
*/
|
||||
public CommonTermsQueryBuilder analyzer(String analyzer) {
|
||||
this.analyzer = analyzer;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the boost to apply to the query.
|
||||
*/
|
||||
public CommonTermsQueryBuilder boost(float boost) {
|
||||
this.boost = boost;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the cutoff document frequency for high / low frequent terms. A value
|
||||
* in [0..1] (or absolute number >=1) representing the maximum threshold of
|
||||
* a terms document frequency to be considered a low frequency term.
|
||||
* Defaults to
|
||||
* <tt>{@value CommonTermsQueryParser#DEFAULT_MAX_TERM_DOC_FREQ}</tt>
|
||||
*/
|
||||
public CommonTermsQueryBuilder cutoffFrequency(float cutoffFrequency) {
|
||||
this.cutoffFrequency = cutoffFrequency;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the minimum number of query terms that need to match in order to
|
||||
* produce a hit.
|
||||
*/
|
||||
public CommonTermsQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
|
||||
this.minimumShouldMatch = minimumShouldMatch;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
builder.startObject(CommonTermsQueryParser.NAME);
|
||||
builder.startObject(name);
|
||||
|
||||
builder.field("query", text);
|
||||
if (disableCoords != null) {
|
||||
builder.field("disable_coords", disableCoords);
|
||||
}
|
||||
if (highFreqOperator != null) {
|
||||
builder.field("high_freq_operator", highFreqOperator.toString());
|
||||
}
|
||||
if (lowFreqOperator != null) {
|
||||
builder.field("low_freq_operator", lowFreqOperator.toString());
|
||||
}
|
||||
if (analyzer != null) {
|
||||
builder.field("analyzer", analyzer);
|
||||
}
|
||||
if (boost != null) {
|
||||
builder.field("boost", boost);
|
||||
}
|
||||
if (cutoffFrequency != null) {
|
||||
builder.field("cutoff_frequency", cutoffFrequency);
|
||||
}
|
||||
if (minimumShouldMatch != null) {
|
||||
builder.field("minimum_should_match", minimumShouldMatch);
|
||||
}
|
||||
|
||||
builder.endObject();
|
||||
builder.endObject();
|
||||
}
|
||||
}
|
@ -0,0 +1,203 @@
|
||||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.query;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.CommonTermsQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.io.FastStringReader;
|
||||
import org.elasticsearch.common.lucene.search.Queries;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.index.mapper.FieldMapper;
|
||||
import org.elasticsearch.index.mapper.MapperService;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class CommonTermsQueryParser implements QueryParser {
|
||||
|
||||
public static final String NAME = "common";
|
||||
|
||||
static final float DEFAULT_MAX_TERM_DOC_FREQ = 0.01f;
|
||||
|
||||
static final Occur DEFAULT_HIGH_FREQ_OCCUR = Occur.MUST;
|
||||
|
||||
static final Occur DEFAULT_LOW_FREQ_OCCUR = Occur.MUST;
|
||||
|
||||
static final boolean DEFAULT_DISABLE_COORDS = true;
|
||||
|
||||
|
||||
@Inject
|
||||
public CommonTermsQueryParser() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] names() {
|
||||
return new String[] { NAME };
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {
|
||||
XContentParser parser = parseContext.parser();
|
||||
XContentParser.Token token = parser.nextToken();
|
||||
if (token != XContentParser.Token.FIELD_NAME) {
|
||||
throw new QueryParsingException(parseContext.index(), "[common] query malformed, no field");
|
||||
}
|
||||
String fieldName = parser.currentName();
|
||||
Object value = null;
|
||||
float boost = 1.0f;
|
||||
String queryAnalyzer = null;
|
||||
String minimumShouldMatch = null;
|
||||
boolean disableCoords = DEFAULT_DISABLE_COORDS;
|
||||
Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
|
||||
Occur lowFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
|
||||
float maxTermFrequency = DEFAULT_MAX_TERM_DOC_FREQ;
|
||||
token = parser.nextToken();
|
||||
if (token == XContentParser.Token.START_OBJECT) {
|
||||
String currentFieldName = null;
|
||||
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
|
||||
if (token == XContentParser.Token.FIELD_NAME) {
|
||||
currentFieldName = parser.currentName();
|
||||
} else if (token.isValue()) {
|
||||
if ("query".equals(currentFieldName)) {
|
||||
value = parser.objectText();
|
||||
} else if ("analyzer".equals(currentFieldName)) {
|
||||
String analyzer = parser.text();
|
||||
if (parseContext.analysisService().analyzer(analyzer) == null) {
|
||||
throw new QueryParsingException(parseContext.index(), "[common] analyzer [" + parser.text() + "] not found");
|
||||
}
|
||||
queryAnalyzer = analyzer;
|
||||
} else if ("disable_coords".equals(currentFieldName)) {
|
||||
disableCoords = parser.booleanValue();
|
||||
} else if ("boost".equals(currentFieldName)) {
|
||||
boost = parser.floatValue();
|
||||
} else if ("high_freq_operator".equals(currentFieldName)) {
|
||||
String op = parser.text();
|
||||
if ("or".equalsIgnoreCase(op)) {
|
||||
highFreqOccur = BooleanClause.Occur.SHOULD;
|
||||
} else if ("and".equalsIgnoreCase(op)) {
|
||||
highFreqOccur = BooleanClause.Occur.MUST;
|
||||
} else {
|
||||
throw new QueryParsingException(parseContext.index(),
|
||||
"[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
|
||||
}
|
||||
} else if ("low_freq_operator".equals(currentFieldName)) {
|
||||
String op = parser.text();
|
||||
if ("or".equalsIgnoreCase(op)) {
|
||||
lowFreqOccur = BooleanClause.Occur.SHOULD;
|
||||
} else if ("and".equalsIgnoreCase(op)) {
|
||||
lowFreqOccur = BooleanClause.Occur.MUST;
|
||||
} else {
|
||||
throw new QueryParsingException(parseContext.index(),
|
||||
"[common] query requires operator to be either 'and' or 'or', not [" + op + "]");
|
||||
}
|
||||
} else if ("minimum_should_match".equals(currentFieldName) || "minimumShouldMatch".equals(currentFieldName)) {
|
||||
minimumShouldMatch = parser.textOrNull();
|
||||
} else if ("cutoff_frequency".equals(currentFieldName)) {
|
||||
maxTermFrequency = parser.floatValue();
|
||||
} else {
|
||||
throw new QueryParsingException(parseContext.index(), "[common] query does not support [" + currentFieldName + "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
parser.nextToken();
|
||||
} else {
|
||||
value = parser.objectText();
|
||||
// move to the next token
|
||||
token = parser.nextToken();
|
||||
if (token != XContentParser.Token.END_OBJECT) {
|
||||
throw new QueryParsingException(
|
||||
parseContext.index(),
|
||||
"[common] query parsed in simplified form, with direct field name, but included more options than just the field name, possibly use its 'options' form, with 'query' element?");
|
||||
}
|
||||
}
|
||||
|
||||
if (value == null) {
|
||||
throw new QueryParsingException(parseContext.index(), "No text specified for text query");
|
||||
}
|
||||
CommonTermsQuery query = new CommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
|
||||
int numTerms = parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer);
|
||||
if (numTerms == 0) {
|
||||
return null;
|
||||
}
|
||||
if (minimumShouldMatch != null) {
|
||||
query.setMinimumNumberShouldMatch(Queries.calculateMinShouldMatch(numTerms, minimumShouldMatch));
|
||||
}
|
||||
query.setBoost(boost);
|
||||
return query;
|
||||
}
|
||||
|
||||
private final int parseQueryString(CommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
|
||||
String queryAnalyzer) throws IOException {
|
||||
FieldMapper<?> mapper = null;
|
||||
String field;
|
||||
MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
|
||||
if (smartNameFieldMappers != null && smartNameFieldMappers.hasMapper()) {
|
||||
mapper = smartNameFieldMappers.mapper();
|
||||
field = mapper.names().indexName();
|
||||
} else {
|
||||
field = fieldName;
|
||||
}
|
||||
|
||||
Analyzer analyzer = null;
|
||||
if (queryAnalyzer == null) {
|
||||
if (mapper != null) {
|
||||
analyzer = mapper.searchAnalyzer();
|
||||
}
|
||||
if (analyzer == null && smartNameFieldMappers != null) {
|
||||
analyzer = smartNameFieldMappers.searchAnalyzer();
|
||||
}
|
||||
if (analyzer == null) {
|
||||
analyzer = parseContext.mapperService().searchAnalyzer();
|
||||
}
|
||||
} else {
|
||||
analyzer = parseContext.mapperService().analysisService().analyzer(queryAnalyzer);
|
||||
if (analyzer == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("No analyzer found for [" + queryAnalyzer + "]");
|
||||
}
|
||||
}
|
||||
|
||||
// Logic similar to QueryParser#getFieldQuery
|
||||
TokenStream source = analyzer.tokenStream(field, new FastStringReader(queryString.toString()));
|
||||
source.reset();
|
||||
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
int count = 0;
|
||||
while (source.incrementToken()) {
|
||||
BytesRef ref = new BytesRef(termAtt.length() * 4); // oversize for
|
||||
// UTF-8
|
||||
UnicodeUtil.UTF16toUTF8(termAtt.buffer(), 0, termAtt.length(), ref);
|
||||
query.add(new Term(field, ref));
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
|
||||
}
|
||||
}
|
@ -65,6 +65,16 @@ public abstract class QueryBuilders {
|
||||
public static MatchQueryBuilder matchQuery(String name, Object text) {
|
||||
return new MatchQueryBuilder(name, text).type(MatchQueryBuilder.Type.BOOLEAN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a common query for the provided field name and text.
|
||||
*
|
||||
* @param name The field name.
|
||||
* @param text The query text (to be analyzed).
|
||||
*/
|
||||
public static CommonTermsQueryBuilder commonTerms(String name, Object text) {
|
||||
return new CommonTermsQueryBuilder(name, text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a match query with type "BOOLEAN" for the provided field name and text.
|
||||
|
@ -101,6 +101,8 @@ public class IndicesQueriesModule extends AbstractModule {
|
||||
qpBinders.addBinding().to(FuzzyLikeThisFieldQueryParser.class).asEagerSingleton();
|
||||
qpBinders.addBinding().to(WrapperQueryParser.class).asEagerSingleton();
|
||||
qpBinders.addBinding().to(IndicesQueryParser.class).asEagerSingleton();
|
||||
qpBinders.addBinding().to(CommonTermsQueryParser.class).asEagerSingleton();
|
||||
|
||||
if (ShapesAvailability.JTS_AVAILABLE) {
|
||||
qpBinders.addBinding().to(GeoShapeQueryParser.class).asEagerSingleton();
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ import org.elasticsearch.action.search.SearchResponse;
|
||||
import org.elasticsearch.client.Client;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.index.query.*;
|
||||
import org.elasticsearch.index.query.CommonTermsQueryBuilder.Operator;
|
||||
import org.elasticsearch.rest.RestStatus;
|
||||
import org.elasticsearch.search.facet.FacetBuilders;
|
||||
import org.elasticsearch.test.integration.AbstractNodesTests;
|
||||
@ -104,7 +105,43 @@ public class SimpleQueryTests extends AbstractNodesTests {
|
||||
assertTrue(e.getMessage().endsWith("IllegalStateException[field \"field1\" was indexed without position data; cannot run PhraseQuery (term=quick)]; }"));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommonTermsQuery() throws Exception {
|
||||
try {
|
||||
client.admin().indices().prepareDelete("test").execute().actionGet();
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
client.admin().indices().prepareCreate("test")
|
||||
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("field1").field("analyzer", "whitespace").field("type", "string").endObject().endObject().endObject().endObject())
|
||||
.setSettings(ImmutableSettings.settingsBuilder().put("number_of_shards", 1)).execute().actionGet();
|
||||
|
||||
client.prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox").execute().actionGet();
|
||||
client.prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree").execute().actionGet();
|
||||
client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown").setRefresh(true).execute().actionGet();
|
||||
|
||||
SearchResponse searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3)).execute().actionGet();
|
||||
assertThat(searchResponse.hits().totalHits(), equalTo(2l));
|
||||
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
|
||||
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
|
||||
|
||||
|
||||
searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).lowFreqOperator(Operator.OR)).execute().actionGet();
|
||||
assertThat(searchResponse.hits().totalHits(), equalTo(3l));
|
||||
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
|
||||
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
|
||||
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
|
||||
|
||||
searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet();
|
||||
assertThat(searchResponse.hits().totalHits(), equalTo(3l));
|
||||
// standard drops "the" since its a stopword
|
||||
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
|
||||
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
|
||||
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOmitTermFreqsAndPositions() throws Exception {
|
||||
// backwards compat test!
|
||||
|
Loading…
x
Reference in New Issue
Block a user