Add support for `quote_field_suffix` to `simple_query_string`. (#21060)

Closes #18641
This commit is contained in:
Adrien Grand 2016-10-28 09:11:57 +02:00 committed by GitHub
parent 557506ba6e
commit 9cbbddb6dc
9 changed files with 384 additions and 12 deletions

View File

@ -120,9 +120,18 @@ public class SimpleQueryParser extends org.apache.lucene.queryparser.simple.Simp
bq.setDisableCoord(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
try {
Query q = createPhraseQuery(entry.getKey(), text, slop);
String field = entry.getKey();
if (settings.quoteFieldSuffix() != null) {
String quoteField = field + settings.quoteFieldSuffix();
MappedFieldType quotedFieldType = context.fieldMapper(quoteField);
if (quotedFieldType != null) {
field = quoteField;
}
}
Float boost = entry.getValue();
Query q = createPhraseQuery(field, text, slop);
if (q != null) {
bq.add(wrapWithBoost(q, entry.getValue()), BooleanClause.Occur.SHOULD);
bq.add(wrapWithBoost(q, boost), BooleanClause.Occur.SHOULD);
}
} catch (RuntimeException e) {
rethrowUnlessLenient(e);
@ -256,6 +265,8 @@ public class SimpleQueryParser extends org.apache.lucene.queryparser.simple.Simp
private boolean lenient = SimpleQueryStringBuilder.DEFAULT_LENIENT;
/** Specifies whether wildcards should be analyzed. */
private boolean analyzeWildcard = SimpleQueryStringBuilder.DEFAULT_ANALYZE_WILDCARD;
/** Specifies a suffix, if any, to apply to field names for phrase matching. */
private String quoteFieldSuffix = null;
/**
* Generates default {@link Settings} object (uses ROOT locale, does
@ -264,13 +275,6 @@ public class SimpleQueryParser extends org.apache.lucene.queryparser.simple.Simp
public Settings() {
}
public Settings(Locale locale, Boolean lowercaseExpandedTerms, Boolean lenient, Boolean analyzeWildcard) {
this.locale = locale;
this.lowercaseExpandedTerms = lowercaseExpandedTerms;
this.lenient = lenient;
this.analyzeWildcard = analyzeWildcard;
}
/** Specifies the locale to use for parsing, Locale.ROOT by default. */
public void locale(Locale locale) {
this.locale = (locale != null) ? locale : SimpleQueryStringBuilder.DEFAULT_LOCALE;
@ -314,12 +318,27 @@ public class SimpleQueryParser extends org.apache.lucene.queryparser.simple.Simp
return analyzeWildcard;
}
/**
* Set the suffix to append to field names for phrase matching.
* A {@code null} suffix, which is the default, leaves field names unchanged.
*
* @param suffix the suffix to append to field names, or {@code null} for none
*/
public void quoteFieldSuffix(String suffix) {
this.quoteFieldSuffix = suffix;
}
/**
* Return the suffix to append for phrase matching, or {@code null} if
* no suffix should be appended.
*
* @return the configured suffix, or {@code null} if none was set
*/
public String quoteFieldSuffix() {
return quoteFieldSuffix;
}
@Override
public int hashCode() {
// checking the return value of toLanguageTag() for locales only.
// For further reasoning see
// https://issues.apache.org/jira/browse/LUCENE-4021
return Objects.hash(locale.toLanguageTag(), lowercaseExpandedTerms, lenient, analyzeWildcard);
return Objects.hash(locale.toLanguageTag(), lowercaseExpandedTerms, lenient, analyzeWildcard, quoteFieldSuffix);
}
@Override
@ -338,7 +357,8 @@ public class SimpleQueryParser extends org.apache.lucene.queryparser.simple.Simp
return (Objects.equals(locale.toLanguageTag(), other.locale.toLanguageTag())
&& Objects.equals(lowercaseExpandedTerms, other.lowercaseExpandedTerms)
&& Objects.equals(lenient, other.lenient)
&& Objects.equals(analyzeWildcard, other.analyzeWildcard));
&& Objects.equals(analyzeWildcard, other.analyzeWildcard)
&& Objects.equals(quoteFieldSuffix, other.quoteFieldSuffix));
}
}
}

View File

@ -22,6 +22,7 @@ package org.elasticsearch.index.query;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
@ -94,6 +95,8 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
/** Name for (de-)serialization. */
public static final String NAME = "simple_query_string";
/**
* Version from which {@code quote_field_suffix} is read from and written to
* the stream by this builder; older versions skip the field entirely.
* NOTE(review): presumably meant to be replaced by the official constant once
* 5.1.0 is added to Version.java - confirm.
*/
public static final Version V_5_1_0_UNRELEASED = Version.fromId(5010099);
private static final ParseField MINIMUM_SHOULD_MATCH_FIELD = new ParseField("minimum_should_match");
private static final ParseField ANALYZE_WILDCARD_FIELD = new ParseField("analyze_wildcard");
private static final ParseField LENIENT_FIELD = new ParseField("lenient");
@ -104,6 +107,7 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
private static final ParseField ANALYZER_FIELD = new ParseField("analyzer");
private static final ParseField QUERY_FIELD = new ParseField("query");
private static final ParseField FIELDS_FIELD = new ParseField("fields");
private static final ParseField QUOTE_FIELD_SUFFIX_FIELD = new ParseField("quote_field_suffix");
/** Query text to parse. */
private final String queryText;
@ -158,6 +162,9 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
settings.analyzeWildcard(in.readBoolean());
settings.locale(Locale.forLanguageTag(in.readString()));
minimumShouldMatch = in.readOptionalString();
if (in.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) {
settings.quoteFieldSuffix(in.readOptionalString());
}
}
@Override
@ -176,6 +183,9 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
out.writeBoolean(settings.analyzeWildcard());
out.writeString(settings.locale().toLanguageTag());
out.writeOptionalString(minimumShouldMatch);
if (out.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) {
out.writeOptionalString(settings.quoteFieldSuffix());
}
}
/** Returns the text to parse the query from. */
@ -292,6 +302,21 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
return this.settings.locale();
}
/**
* Set the suffix to append to field names for phrase matching.
* The value is stored on the underlying parser settings.
*
* @param suffix the suffix to append to field names, may be {@code null}
* @return this builder, to allow call chaining
*/
public SimpleQueryStringBuilder quoteFieldSuffix(String suffix) {
settings.quoteFieldSuffix(suffix);
return this;
}
/**
* Return the suffix to append to field names for phrase matching.
*
* @return the suffix held by the underlying parser settings, possibly {@code null}
*/
public String quoteFieldSuffix() {
return settings.quoteFieldSuffix();
}
/** Specifies whether query parsing should be lenient. Defaults to false. */
public SimpleQueryStringBuilder lenient(boolean lenient) {
this.settings.lenient(lenient);
@ -408,6 +433,9 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
builder.field(LENIENT_FIELD.getPreferredName(), settings.lenient());
builder.field(ANALYZE_WILDCARD_FIELD.getPreferredName(), settings.analyzeWildcard());
builder.field(LOCALE_FIELD.getPreferredName(), (settings.locale().toLanguageTag()));
if (settings.quoteFieldSuffix() != null) {
builder.field(QUOTE_FIELD_SUFFIX_FIELD.getPreferredName(), settings.quoteFieldSuffix());
}
if (minimumShouldMatch != null) {
builder.field(MINIMUM_SHOULD_MATCH_FIELD.getPreferredName(), minimumShouldMatch);
@ -433,6 +461,7 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
boolean lowercaseExpandedTerms = SimpleQueryStringBuilder.DEFAULT_LOWERCASE_EXPANDED_TERMS;
boolean analyzeWildcard = SimpleQueryStringBuilder.DEFAULT_ANALYZE_WILDCARD;
Locale locale = null;
String quoteFieldSuffix = null;
XContentParser.Token token;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
@ -495,6 +524,8 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
queryName = parser.text();
} else if (parseContext.getParseFieldMatcher().match(currentFieldName, MINIMUM_SHOULD_MATCH_FIELD)) {
minimumShouldMatch = parser.textOrNull();
} else if (parseContext.getParseFieldMatcher().match(currentFieldName, QUOTE_FIELD_SUFFIX_FIELD)) {
quoteFieldSuffix = parser.textOrNull();
} else {
throw new ParsingException(parser.getTokenLocation(), "[" + SimpleQueryStringBuilder.NAME +
"] unsupported field [" + parser.currentName() + "]");
@ -513,7 +544,7 @@ public class SimpleQueryStringBuilder extends AbstractQueryBuilder<SimpleQuerySt
SimpleQueryStringBuilder qb = new SimpleQueryStringBuilder(queryBody);
qb.boost(boost).fields(fieldsAndWeights).analyzer(analyzerName).queryName(queryName).minimumShouldMatch(minimumShouldMatch);
qb.flags(flags).defaultOperator(defaultOperator).locale(locale).lowercaseExpandedTerms(lowercaseExpandedTerms);
qb.lenient(lenient).analyzeWildcard(analyzeWildcard).boost(boost);
qb.lenient(lenient).analyzeWildcard(analyzeWildcard).boost(boost).quoteFieldSuffix(quoteFieldSuffix);
return Optional.of(qb);
}

View File

@ -24,6 +24,7 @@ import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.monitor.os.OsStats;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
import org.elasticsearch.search.internal.AliasFilter;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.VersionUtils;
@ -273,6 +274,7 @@ public class VersionTests extends ESTestCase {
expectThrows(AssertionError.class, () -> assertUnknownVersion(Version.CURRENT));
assertUnknownVersion(AliasFilter.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant
assertUnknownVersion(OsStats.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant
assertUnknownVersion(SimpleQueryStringBuilder.V_5_1_0_UNRELEASED);
// once we released 5.0.0 and it's added to Version.java we need to remove this constant
assertUnknownVersion(ShardValidateQueryRequestTests.V_5_0_0);
}

View File

@ -28,14 +28,47 @@ import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.ContentPath;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MockFieldMapper;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.elasticsearch.indices.query.IndicesQueriesRegistry;
import org.elasticsearch.search.SearchModule;
import org.elasticsearch.test.ESTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.settings.Settings;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import static java.util.Collections.emptyList;
import static org.hamcrest.Matchers.equalTo;
public class SimpleQueryParserTests extends ESTestCase {
private static IndicesQueriesRegistry indicesQueriesRegistry;
/**
* Creates the query parser registry once, before any test of this class runs.
* The registry is obtained from a {@link SearchModule} built with empty settings
* and no plugins.
*/
@BeforeClass
public static void init() {
SearchModule searchModule = new SearchModule(Settings.EMPTY, false, emptyList());
indicesQueriesRegistry = searchModule.getQueryParserRegistry();
}
/**
* Drops the static reference to the registry once all tests of this class have run.
*/
@AfterClass
public static void afterClass() throws Exception {
indicesQueriesRegistry = null;
}
private static class MockSimpleQueryParser extends SimpleQueryParser {
public MockSimpleQueryParser(Analyzer analyzer, Map<String, Float> weights, int flags, Settings settings) {
super(analyzer, weights, flags, settings, null);
@ -106,4 +139,45 @@ public class SimpleQueryParserTests extends ESTestCase {
}
}
/**
 * Checks that quoted text is searched on {@code foo.quote} when that field is
 * mapped and on {@code foo} when it is not, while unquoted text always
 * targets {@code foo}.
 */
public void testQuoteFieldSuffix() {
    SimpleQueryParser.Settings sqpSettings = new SimpleQueryParser.Settings();
    sqpSettings.quoteFieldSuffix(".quote");
    Settings indexSettings = Settings.builder()
        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
        .put(IndexMetaData.SETTING_INDEX_UUID, "some_uuid")
        .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
        .build();
    IndexMetaData indexState = IndexMetaData.builder("index").settings(indexSettings).build();
    IndexSettings settings = new IndexSettings(indexState, Settings.EMPTY);

    // foo.quote is mapped: only the quoted query is redirected to it
    SimpleQueryParser parser = new SimpleQueryParser(new StandardAnalyzer(),
        Collections.singletonMap("foo", 1f), -1, sqpSettings, newShardContext(settings, true));
    assertEquals(new TermQuery(new Term("foo", "bar")), parser.parse("bar"));
    assertEquals(new TermQuery(new Term("foo.quote", "bar")), parser.parse("\"bar\""));

    // Now check what happens if foo.quote does not exist
    parser = new SimpleQueryParser(new StandardAnalyzer(),
        Collections.singletonMap("foo", 1f), -1, sqpSettings, newShardContext(settings, false));
    assertEquals(new TermQuery(new Term("foo", "bar")), parser.parse("bar"));
    assertEquals(new TermQuery(new Term("foo", "bar")), parser.parse("\"bar\""));
}

/**
 * Builds a shard context whose field lookup returns a fake field type for
 * every name, except {@code foo.quote} when {@code quoteFieldMapped} is false,
 * in which case that lookup returns {@code null}.
 */
private static QueryShardContext newShardContext(IndexSettings indexSettings, boolean quoteFieldMapped) {
    return new QueryShardContext(0, indexSettings, null, null, null, null, null, indicesQueriesRegistry,
        null, null, null, System::currentTimeMillis) {
        @Override
        public MappedFieldType fieldMapper(String name) {
            if (quoteFieldMapped == false && name.equals("foo.quote")) {
                return null;
            }
            return new MockFieldMapper.FakeFieldType();
        }
    };
}
}

View File

@ -19,6 +19,8 @@
package org.elasticsearch.index.query;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@ -27,6 +29,7 @@ import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.TestUtil;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.test.AbstractQueryTestCase;
@ -71,6 +74,9 @@ public class SimpleQueryStringBuilderTests extends AbstractQueryTestCase<SimpleQ
if (randomBoolean()) {
result.defaultOperator(randomFrom(Operator.values()));
}
if (randomBoolean()) {
result.quoteFieldSuffix(TestUtil.randomSimpleString(random()));
}
if (randomBoolean()) {
Set<SimpleQueryStringFlag> flagSet = new HashSet<>();
int size = randomIntBetween(0, SimpleQueryStringFlag.values().length);
@ -334,6 +340,7 @@ public class SimpleQueryStringBuilderTests extends AbstractQueryTestCase<SimpleQ
" \"lenient\" : false,\n" +
" \"analyze_wildcard\" : false,\n" +
" \"locale\" : \"und\",\n" +
" \"quote_field_suffix\" : \".quote\",\n" +
" \"boost\" : 1.0\n" +
" }\n" +
"}";
@ -344,6 +351,7 @@ public class SimpleQueryStringBuilderTests extends AbstractQueryTestCase<SimpleQ
assertEquals(json, "\"fried eggs\" +(eggplant | potato) -frittata", parsed.value());
assertEquals(json, 2, parsed.fields().size());
assertEquals(json, "snowball", parsed.analyzer());
assertEquals(json, ".quote", parsed.quoteFieldSuffix());
}
public void testMinimumShouldMatch() throws IOException {

View File

@ -17,6 +17,8 @@ made.
include::how-to/general.asciidoc[]
include::how-to/recipes.asciidoc[]
include::how-to/indexing-speed.asciidoc[]
include::how-to/search-speed.asciidoc[]

View File

@ -0,0 +1,225 @@
[[recipes]]
== Recipes
[float]
[[mixing-exact-search-with-stemming]]
=== Mixing exact search with stemming
When building a search application, stemming is often a must as it is desirable
for a query on `skiing` to match documents that contain `ski` or `skis`. But
what if a user wants to search for `skiing` specifically? The typical way to do
this would be to use a <<multi-fields,multi-field>> in order to have the same
content indexed in two different ways:
[source,js]
--------------------------------------------------
PUT index
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"type": {
"properties": {
"body": {
"type": "text",
"analyzer": "english",
"fields": {
"exact": {
"type": "text",
"analyzer": "english_exact"
}
}
}
}
}
}
}
PUT index/type/1
{
"body": "Ski resort"
}
PUT index/type/2
{
"body": "A pair of skis"
}
POST index/_refresh
--------------------------------------------------
// CONSOLE
With such a setup, searching for `ski` on `body` would return both documents:
[source,js]
--------------------------------------------------
GET index/_search
{
"query": {
"simple_query_string": {
"fields": [ "body" ],
"query": "ski"
}
}
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
[source,js]
--------------------------------------------------
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.25811607,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "2",
"_score": 0.25811607,
"_source": {
"body": "A pair of skis"
}
},
{
"_index": "index",
"_type": "type",
"_id": "1",
"_score": 0.25811607,
"_source": {
"body": "Ski resort"
}
}
]
}
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 2,/"took": "$body.took",/]
On the other hand, searching for `ski` on `body.exact` would only return
document `1` since the analysis chain of `body.exact` does not perform
stemming.
[source,js]
--------------------------------------------------
GET index/_search
{
"query": {
"simple_query_string": {
"fields": [ "body.exact" ],
"query": "ski"
}
}
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
[source,js]
--------------------------------------------------
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.25811607,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "1",
"_score": 0.25811607,
"_source": {
"body": "Ski resort"
}
}
]
}
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 1,/"took": "$body.took",/]
This is not something that is easy to expose to end users, as we would need to
have a way to figure out whether they are looking for an exact match or not and
redirect to the appropriate field accordingly. Also, what should happen if only
parts of the query need to be matched exactly while other parts should still
take stemming into account?
Fortunately, the `query_string` and `simple_query_string` queries have a feature
that solves exactly this problem: `quote_field_suffix`. It tells Elasticsearch
that words that appear in between quotes should be redirected to a different
field, as shown below:
[source,js]
--------------------------------------------------
GET index/_search
{
"query": {
"simple_query_string": {
"fields": [ "body" ],
"quote_field_suffix": ".exact",
"query": "\"ski\""
}
}
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
[source,js]
--------------------------------------------------
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.25811607,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "1",
"_score": 0.25811607,
"_source": {
"body": "Ski resort"
}
}
]
}
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 2,/"took": "$body.took",/]
In that case, since `ski` was in-between quotes, it was searched on the
`body.exact` field due to the `quote_field_suffix` parameter, so only document
`1` matched. This allows users to mix exact search with stemmed search as they
like.

View File

@ -85,6 +85,11 @@ Defaults to `ROOT`.
|`time_zone` | Time Zone to be applied to any range query related to dates. See also
http://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html[JODA timezone].
|`quote_field_suffix` | A suffix to append to fields for quoted parts of
the query string. This makes it possible to use a field that has a different
analysis chain for exact matching. See
<<mixing-exact-search-with-stemming,Mixing exact search with stemming>> for a
comprehensive example.
|=======================================================================
When a multi term query is being generated, one can control how it gets

View File

@ -63,6 +63,11 @@ Defaults to `ROOT`.
document to be returned. See the
<<query-dsl-minimum-should-match,`minimum_should_match`>> documentation for the
full list of options.
|`quote_field_suffix` | A suffix to append to fields for quoted parts of
the query string. This makes it possible to use a field that has a different
analysis chain for exact matching. See
<<mixing-exact-search-with-stemming,Mixing exact search with stemming>> for a
comprehensive example.
|=======================================================================
[float]