Core: add max_determinized_states to query_string and regexp query/filter

This prevents too-difficult regular expressions from consuming
excessive RAM/CPU; the default max_determinized_states is 10,000 (same
as Lucene) but query_string and regexp query/filter can override
per-request.

This also upgrades to a new Lucene 5.0.0 snapshot.

Closes #8386

Closes #8357
This commit is contained in:
Michael McCandless 2014-11-10 13:43:48 -05:00 committed by mikemccand
parent f9810e591e
commit 8aebb9656b
30 changed files with 261 additions and 75 deletions

View File

@ -7,5 +7,3 @@ java.io.File#delete() @ use Files.delete for real exception, IOUtils.deleteFiles
# temporary situation, until we upgrade with LUCENE-6051 fix
# (at which point forbidden apis will fail and we remove this section)
@defaultMessage Use FileSystemUtils methods for now to workaround LUCENE-6051
org.apache.lucene.util.IOUtils#deleteFilesIgnoringExceptions(java.lang.Iterable)
org.apache.lucene.util.IOUtils#deleteFilesIfExist(java.lang.Iterable)

View File

@ -27,7 +27,14 @@ See <<regexp-syntax>> for details of the supported regular expression language.
You can also select the cache name and use the same regexp flags in the
filter as in the query.
*Note*: You have to enable caching explicitly in order to have the
Regular expressions are dangerous because it's easy to accidentally
create an innocuous looking one that requires an exponential number of
internal determinized automaton states (and corresponding RAM and CPU)
for Lucene to execute. Lucene prevents these using the
`max_determinized_states` setting (defaults to 10000). You can raise
this limit to allow more complex regular expressions to execute.
You have to enable caching explicitly in order to have the
`regexp` filter cached.
[source,js]
@ -41,7 +48,8 @@ filter as in the query.
"regexp":{
"name.first" : {
"value" : "s.*y",
"flags" : "INTERSECTION|COMPLEMENT|EMPTY"
"flags" : "INTERSECTION|COMPLEMENT|EMPTY",
"max_determinized_states": 20000
},
"_name":"test",
"_cache" : true,

View File

@ -61,7 +61,11 @@ phrase matches are required. Default value is `0`.
not analyzed. By setting this value to `true`, a best effort will be
made to analyze those as well.
|`auto_generate_phrase_queries` |Default to `false`.
|`auto_generate_phrase_queries` |Defaults to `false`.
|`max_determinized_states` |Limit on how many automaton states regexp
queries are allowed to create. This protects against too-difficult
(e.g. exponentially hard) regexps. Defaults to 10000.
|`minimum_should_match` |A value controlling how many "should" clauses
in the resulting boolean query should match. It can be an absolute value

View File

@ -55,5 +55,25 @@ Possible flags are `ALL`, `ANYSTRING`, `AUTOMATON`, `COMPLEMENT`,
http://lucene.apache.org/core/4_9_0/core/org/apache/lucene/util/automaton/RegExp.html[Lucene
documentation] for their meaning
Regular expressions are dangerous because it's easy to accidentally
create an innocuous looking one that requires an exponential number of
internal determinized automaton states (and corresponding RAM and CPU)
for Lucene to execute. Lucene prevents these using the
`max_determinized_states` setting (defaults to 10000). You can raise
this limit to allow more complex regular expressions to execute.
[source,js]
--------------------------------------------------
{
"regexp":{
"name.first": {
"value": "s.*y",
"flags" : "INTERSECTION|COMPLEMENT|EMPTY",
"max_determinized_states": 20000
}
}
}
--------------------------------------------------
include::regexp-syntax.asciidoc[]

View File

@ -32,7 +32,7 @@
<properties>
<lucene.version>5.0.0</lucene.version>
<lucene.maven.version>5.0.0-snapshot-1636426</lucene.maven.version>
<lucene.maven.version>5.0.0-snapshot-1637347</lucene.maven.version>
<tests.jvms>auto</tests.jvms>
<tests.shuffle>true</tests.shuffle>
<tests.output>onerror</tests.output>
@ -52,7 +52,11 @@
</repository>
<repository>
<id>Lucene snapshots</id>
<url>https://download.elasticsearch.org/lucenesnapshots/maven/</url>
<url>https://download.elasticsearch.org/lucenesnapshots/1637347</url>
</repository>
<repository>
<id>Temporary</id>
<url>http://people.apache.org/~mikemccand/fake_staging_area/repo/</url>
</repository>
</repositories>

View File

@ -21,6 +21,7 @@ package org.apache.lucene.queryparser.classic;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -123,6 +124,7 @@ public class MapperQueryParser extends QueryParser {
setMultiTermRewriteMethod(settings.rewriteMethod());
setEnablePositionIncrements(settings.enablePositionIncrements());
setAutoGeneratePhraseQueries(settings.autoGeneratePhraseQueries());
setMaxDeterminizedStates(settings.maxDeterminizedStates());
setAllowLeadingWildcard(settings.allowLeadingWildcard());
setLowercaseExpandedTerms(settings.lowercaseExpandedTerms());
setPhraseSlop(settings.phraseSlop());
@ -814,12 +816,12 @@ public class MapperQueryParser extends QueryParser {
if (fieldMappers.explicitTypeInNameWithDocMapper()) {
String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()});
try {
query = currentMapper.regexpQuery(termStr, RegExp.ALL, multiTermRewriteMethod, parseContext);
query = currentMapper.regexpQuery(termStr, RegExp.ALL, maxDeterminizedStates, multiTermRewriteMethod, parseContext);
} finally {
QueryParseContext.setTypes(previousTypes);
}
} else {
query = currentMapper.regexpQuery(termStr, RegExp.ALL, multiTermRewriteMethod, parseContext);
query = currentMapper.regexpQuery(termStr, RegExp.ALL, maxDeterminizedStates, multiTermRewriteMethod, parseContext);
}
}
if (query == null) {

View File

@ -20,9 +20,11 @@
package org.apache.lucene.queryparser.classic;
import com.carrotsearch.hppc.ObjectFloatOpenHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.automaton.Operations;
import org.joda.time.DateTimeZone;
import java.util.Collection;
@ -50,6 +52,7 @@ public class QueryParserSettings {
private float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions;
private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
private MultiTermQuery.RewriteMethod fuzzyRewriteMethod = null;
private boolean analyzeWildcard = DEFAULT_ANALYZE_WILDCARD;
private boolean escape = false;
@ -115,6 +118,14 @@ public class QueryParserSettings {
this.autoGeneratePhraseQueries = autoGeneratePhraseQueries;
}
/** Returns the maximum number of determinized automaton states allowed when compiling regexp queries (defaults to Lucene's Operations.DEFAULT_MAX_DETERMINIZED_STATES). */
public int maxDeterminizedStates() {
return maxDeterminizedStates;
}
/** Sets the maximum number of determinized automaton states allowed when compiling regexp queries; protects against too-difficult (e.g. exponentially hard) regexps. */
public void maxDeterminizedStates(int maxDeterminizedStates) {
this.maxDeterminizedStates = maxDeterminizedStates;
}
public boolean allowLeadingWildcard() {
return allowLeadingWildcard;
}
@ -323,6 +334,7 @@ public class QueryParserSettings {
QueryParserSettings that = (QueryParserSettings) o;
if (autoGeneratePhraseQueries != that.autoGeneratePhraseQueries()) return false;
if (maxDeterminizedStates != that.maxDeterminizedStates()) return false;
if (allowLeadingWildcard != that.allowLeadingWildcard) return false;
if (Float.compare(that.boost, boost) != 0) return false;
if (enablePositionIncrements != that.enablePositionIncrements) return false;
@ -378,6 +390,7 @@ public class QueryParserSettings {
result = 31 * result + (boost != +0.0f ? Float.floatToIntBits(boost) : 0);
result = 31 * result + (defaultOperator != null ? defaultOperator.hashCode() : 0);
result = 31 * result + (autoGeneratePhraseQueries ? 1 : 0);
result = 31 * result + maxDeterminizedStates;
result = 31 * result + (allowLeadingWildcard ? 1 : 0);
result = 31 * result + (lowercaseExpandedTerms ? 1 : 0);
result = 31 * result + (enablePositionIncrements ? 1 : 0);

View File

@ -19,6 +19,7 @@
package org.apache.lucene.search.suggest.analyzing;
import com.carrotsearch.hppc.ObjectIntOpenHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
@ -312,7 +313,8 @@ public class XAnalyzingSuggester extends Lookup {
protected Automaton convertAutomaton(Automaton a) {
if (queryPrefix != null) {
a = Operations.concatenate(Arrays.asList(queryPrefix, a));
a = Operations.determinize(a);
// This automaton should not blow up during determinize:
a = Operations.determinize(a, Integer.MAX_VALUE);
}
return a;
}
@ -952,14 +954,16 @@ public class XAnalyzingSuggester extends Lookup {
try {
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
} finally {
IOUtils.closeWhileHandlingException(ts);
IOUtils.closeWhileHandlingException(ts);
}
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = Operations.determinize(automaton);
// This automaton should not blow up during determinize:
automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
return automaton;
}

View File

@ -205,7 +205,8 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
if (unicodeAware) {
// FLORIAN EDIT: get converted Automaton from superclass
Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a));
utf8automaton = Operations.determinize(utf8automaton);
// This automaton should not blow up during determinize:
utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE);
return utf8automaton;
} else {
return super.convertAutomaton(a);
@ -253,7 +254,9 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
Automaton a = Operations.union(Arrays.asList(subs));
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
return Operations.determinize(a);
// This automaton should not blow up during determinize:
return Operations.determinize(a, Integer.MAX_VALUE);
}
}
}

View File

@ -22,7 +22,6 @@ package org.elasticsearch.common.http.client;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.unit.TimeValue;
import java.io.*;
@ -347,7 +346,7 @@ public class HttpDownloadHelper {
// Try to delete the garbage we'd otherwise leave
// behind.
IOUtils.closeWhileHandlingException(os, is);
FileSystemUtils.deleteFilesIgnoringExceptions(dest.toPath());
IOUtils.deleteFilesIgnoringExceptions(dest.toPath());
} else {
IOUtils.close(os, is);
}
@ -386,7 +385,7 @@ public class HttpDownloadHelper {
} else {
IOUtils.closeWhileHandlingException(is, os);
if (dest != null && dest.exists()) {
FileSystemUtils.deleteFilesIgnoringExceptions(dest.toPath());
IOUtils.deleteFilesIgnoringExceptions(dest.toPath());
}
}
}

View File

@ -131,22 +131,6 @@ public class FileSystemUtils {
private FileSystemUtils() {}
/**
* Temporary solution until LUCENE-6051 is fixed
* @see org.apache.lucene.util.IOUtils#deleteFilesIgnoringExceptions(java.nio.file.Path...)
*/
public static void deleteFilesIgnoringExceptions(Path... files) {
for (Path name : files) {
if (name != null) {
try {
Files.delete(name);
} catch (Throwable ignored) {
// ignore
}
}
}
}
/**
* This utility copy a full directory content (excluded) under
* a new directory but without overwriting existing files.

View File

@ -25,6 +25,7 @@ import org.apache.lucene.search.Filter;
import org.apache.lucene.search.MultiTermQueryWrapperFilter;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import java.io.IOException;
@ -47,7 +48,11 @@ public class RegexpFilter extends Filter {
}
public RegexpFilter(Term term, int flags) {
filter = new InternalFilter(term, flags);
this(term, flags, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
public RegexpFilter(Term term, int flags, int maxDeterminizedStates) {
filter = new InternalFilter(term, flags, maxDeterminizedStates);
this.term = term;
this.flags = flags;
}
@ -97,12 +102,8 @@ public class RegexpFilter extends Filter {
static class InternalFilter extends MultiTermQueryWrapperFilter<RegexpQuery> {
public InternalFilter(Term term) {
super(new RegexpQuery(term));
}
public InternalFilter(Term term, int flags) {
super(new RegexpQuery(term, flags));
public InternalFilter(Term term, int flags, int maxDeterminizedStates) {
super(new RegexpQuery(term, flags, maxDeterminizedStates));
}
}

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.mapper;
import com.google.common.base.Strings;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.Term;
@ -260,9 +261,9 @@ public interface FieldMapper<T> extends Mapper {
Filter prefixFilter(Object value, @Nullable QueryParseContext context);
Query regexpQuery(Object value, int flags, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context);
Query regexpQuery(Object value, int flags, int maxDeterminizedStates, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context);
Filter regexpFilter(Object value, int flags, @Nullable QueryParseContext parseContext);
Filter regexpFilter(Object value, int flags, int maxDeterminizedStates, @Nullable QueryParseContext parseContext);
/**
* A term query to use when parsing a query string. Can return <tt>null</tt>.

View File

@ -24,6 +24,7 @@ import com.carrotsearch.hppc.cursors.ObjectCursor;
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -553,8 +554,8 @@ public abstract class AbstractFieldMapper<T> implements FieldMapper<T> {
}
@Override
public Query regexpQuery(Object value, int flags, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context) {
RegexpQuery query = new RegexpQuery(names().createIndexNameTerm(indexedValueForSearch(value)), flags);
public Query regexpQuery(Object value, int flags, int maxDeterminizedStates, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context) {
RegexpQuery query = new RegexpQuery(names().createIndexNameTerm(indexedValueForSearch(value)), flags, maxDeterminizedStates);
if (method != null) {
query.setRewriteMethod(method);
}
@ -562,8 +563,8 @@ public abstract class AbstractFieldMapper<T> implements FieldMapper<T> {
}
@Override
public Filter regexpFilter(Object value, int flags, @Nullable QueryParseContext parseContext) {
return new RegexpFilter(names().createIndexNameTerm(indexedValueForSearch(value)), flags);
public Filter regexpFilter(Object value, int flags, int maxDeterminizedStates, @Nullable QueryParseContext parseContext) {
return new RegexpFilter(names().createIndexNameTerm(indexedValueForSearch(value)), flags, maxDeterminizedStates);
}
@Override

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.mapper.internal;
import com.google.common.collect.Iterables;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -235,13 +236,14 @@ public class IdFieldMapper extends AbstractFieldMapper<String> implements Intern
}
@Override
public Query regexpQuery(Object value, int flags, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context) {
public Query regexpQuery(Object value, int flags, int maxDeterminizedStates, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context) {
if (fieldType.indexOptions() != IndexOptions.NONE || context == null) {
return super.regexpQuery(value, flags, method, context);
return super.regexpQuery(value, flags, maxDeterminizedStates, method, context);
}
Collection<String> queryTypes = context.queryTypes();
if (queryTypes.size() == 1) {
RegexpQuery regexpQuery = new RegexpQuery(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(Iterables.getFirst(queryTypes, null), BytesRefs.toBytesRef(value))), flags);
RegexpQuery regexpQuery = new RegexpQuery(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(Iterables.getFirst(queryTypes, null), BytesRefs.toBytesRef(value))),
flags, maxDeterminizedStates);
if (method != null) {
regexpQuery.setRewriteMethod(method);
}
@ -249,7 +251,7 @@ public class IdFieldMapper extends AbstractFieldMapper<String> implements Intern
}
BooleanQuery query = new BooleanQuery();
for (String queryType : queryTypes) {
RegexpQuery regexpQuery = new RegexpQuery(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(queryType, BytesRefs.toBytesRef(value))), flags);
RegexpQuery regexpQuery = new RegexpQuery(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(queryType, BytesRefs.toBytesRef(value))), flags, maxDeterminizedStates);
if (method != null) {
regexpQuery.setRewriteMethod(method);
}
@ -258,17 +260,19 @@ public class IdFieldMapper extends AbstractFieldMapper<String> implements Intern
return query;
}
public Filter regexpFilter(Object value, int flags, @Nullable QueryParseContext context) {
public Filter regexpFilter(Object value, int flags, int maxDeterminizedStates, @Nullable QueryParseContext context) {
if (fieldType.indexOptions() != IndexOptions.NONE || context == null) {
return super.regexpFilter(value, flags, context);
return super.regexpFilter(value, flags, maxDeterminizedStates, context);
}
Collection<String> queryTypes = context.queryTypes();
if (queryTypes.size() == 1) {
return new RegexpFilter(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(Iterables.getFirst(queryTypes, null), BytesRefs.toBytesRef(value))), flags);
return new RegexpFilter(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(Iterables.getFirst(queryTypes, null), BytesRefs.toBytesRef(value))),
flags, maxDeterminizedStates);
}
XBooleanFilter filter = new XBooleanFilter();
for (String queryType : queryTypes) {
filter.add(new RegexpFilter(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(queryType, BytesRefs.toBytesRef(value))), flags), BooleanClause.Occur.SHOULD);
filter.add(new RegexpFilter(new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(queryType, BytesRefs.toBytesRef(value))),
flags, maxDeterminizedStates), BooleanClause.Occur.SHOULD);
}
return filter;
}

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.query;
import com.carrotsearch.hppc.ObjectFloatOpenHashMap;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
@ -95,6 +96,9 @@ public class QueryStringQueryBuilder extends BaseQueryBuilder implements Boostab
private String timeZone;
/** To limit effort spent determinizing regexp queries. */
private Integer maxDeterminizedStates;
public QueryStringQueryBuilder(String queryString) {
this.queryString = queryString;
}
@ -200,6 +204,14 @@ public class QueryStringQueryBuilder extends BaseQueryBuilder implements Boostab
return this;
}
/**
 * Protects against too-difficult (e.g. exponentially hard) regular expression
 * queries by limiting how many determinized automaton states Lucene may create
 * while compiling a regexp inside the query string (Lucene's default limit is 10000).
 *
 * @param maxDeterminizedStates maximum number of determinized automaton states allowed
 * @return this builder, for method chaining
 */
public QueryStringQueryBuilder maxDeterminizedStates(int maxDeterminizedStates) {
this.maxDeterminizedStates = maxDeterminizedStates;
return this;
}
/**
* Should leading wildcards be allowed or not. Defaults to <tt>true</tt>.
*/
@ -364,6 +376,9 @@ public class QueryStringQueryBuilder extends BaseQueryBuilder implements Boostab
if (autoGeneratePhraseQueries != null) {
builder.field("auto_generate_phrase_queries", autoGeneratePhraseQueries);
}
if (maxDeterminizedStates != null) {
builder.field("max_determinized_states", maxDeterminizedStates);
}
if (allowLeadingWildcard != null) {
builder.field("allow_leading_wildcard", allowLeadingWildcard);
}

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.query;
import com.carrotsearch.hppc.ObjectFloatOpenHashMap;
import com.google.common.collect.Lists;
import org.apache.lucene.queryparser.classic.MapperQueryParser;
import org.apache.lucene.queryparser.classic.QueryParserSettings;
import org.apache.lucene.search.BooleanQuery;
@ -157,6 +158,8 @@ public class QueryStringQueryParser implements QueryParser {
qpSettings.allowLeadingWildcard(parser.booleanValue());
} else if ("auto_generate_phrase_queries".equals(currentFieldName) || "autoGeneratePhraseQueries".equals(currentFieldName)) {
qpSettings.autoGeneratePhraseQueries(parser.booleanValue());
} else if ("max_determinized_states".equals(currentFieldName) || "maxDeterminizedStates".equals(currentFieldName)) {
qpSettings.maxDeterminizedStates(parser.intValue());
} else if ("lowercase_expanded_terms".equals(currentFieldName) || "lowercaseExpandedTerms".equals(currentFieldName)) {
qpSettings.lowercaseExpandedTerms(parser.booleanValue());
} else if ("enable_position_increments".equals(currentFieldName) || "enablePositionIncrements".equals(currentFieldName)) {

View File

@ -19,6 +19,7 @@
package org.elasticsearch.index.query;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
@ -34,6 +35,8 @@ public class RegexpFilterBuilder extends BaseFilterBuilder {
private final String name;
private final String regexp;
private int flags = -1;
private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
private boolean maxDetermizedStatesSet;
private Boolean cache;
private String cacheKey;
@ -75,6 +78,15 @@ public class RegexpFilterBuilder extends BaseFilterBuilder {
return this;
}
/**
 * Sets the maximum number of determinized automaton states allowed when Lucene
 * compiles this filter's regexp; protects against too-difficult (e.g. exponentially
 * hard) expressions. Also marks the value as explicitly set, so it is only
 * serialized into the request body when the caller provided it.
 *
 * @param value maximum number of determinized automaton states allowed
 * @return this builder, for method chaining
 */
public RegexpFilterBuilder maxDeterminizedStates(int value) {
this.maxDeterminizedStates = value;
this.maxDetermizedStatesSet = true;
return this;
}
/**
* Should the filter be cached or not. Defaults to <tt>false</tt>.
*/
@ -96,8 +108,11 @@ public class RegexpFilterBuilder extends BaseFilterBuilder {
} else {
builder.startObject(name)
.field("value", regexp)
.field("flags_value", flags)
.endObject();
.field("flags_value", flags);
if (maxDetermizedStatesSet) {
builder.field("max_determinized_states", maxDeterminizedStates);
}
builder.endObject();
}
if (filterName != null) {
@ -111,4 +126,4 @@ public class RegexpFilterBuilder extends BaseFilterBuilder {
}
builder.endObject();
}
}
}

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.query;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.search.RegexpFilter;
@ -62,6 +63,7 @@ public class RegexpFilterParser implements FilterParser {
String filterName = null;
String currentFieldName = null;
int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
XContentParser.Token token;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
@ -77,6 +79,8 @@ public class RegexpFilterParser implements FilterParser {
} else if ("flags".equals(currentFieldName)) {
String flags = parser.textOrNull();
flagsValue = RegexpFlag.resolveValue(flags);
} else if ("max_determinized_states".equals(currentFieldName)) {
maxDeterminizedStates = parser.intValue();
} else if ("flags_value".equals(currentFieldName)) {
flagsValue = parser.intValue();
} else {
@ -114,16 +118,16 @@ public class RegexpFilterParser implements FilterParser {
if (smartNameFieldMappers.explicitTypeInNameWithDocMapper()) {
String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{smartNameFieldMappers.docMapper().type()});
try {
filter = smartNameFieldMappers.mapper().regexpFilter(value, flagsValue, parseContext);
filter = smartNameFieldMappers.mapper().regexpFilter(value, flagsValue, maxDeterminizedStates, parseContext);
} finally {
QueryParseContext.setTypes(previousTypes);
}
} else {
filter = smartNameFieldMappers.mapper().regexpFilter(value, flagsValue, parseContext);
filter = smartNameFieldMappers.mapper().regexpFilter(value, flagsValue, maxDeterminizedStates, parseContext);
}
}
if (filter == null) {
filter = new RegexpFilter(new Term(fieldName, BytesRefs.toBytesRef(value)), flagsValue);
filter = new RegexpFilter(new Term(fieldName, BytesRefs.toBytesRef(value)), flagsValue, maxDeterminizedStates);
}
if (cache) {
@ -136,4 +140,4 @@ public class RegexpFilterParser implements FilterParser {
}
return filter;
}
}
}

View File

@ -19,6 +19,7 @@
package org.elasticsearch.index.query;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
@ -37,6 +38,8 @@ public class RegexpQueryBuilder extends BaseQueryBuilder implements BoostableQue
private float boost = -1;
private String rewrite;
private String queryName;
private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
private boolean maxDetermizedStatesSet;
/**
* Constructs a new term query.
@ -71,6 +74,15 @@ public class RegexpQueryBuilder extends BaseQueryBuilder implements BoostableQue
return this;
}
/**
 * Sets the maximum number of determinized automaton states allowed when Lucene
 * compiles this query's regexp; protects against too-difficult (e.g. exponentially
 * hard) expressions. Also marks the value as explicitly set, so it is only
 * serialized into the request body when the caller provided it.
 *
 * @param value maximum number of determinized automaton states allowed
 * @return this builder, for method chaining
 */
public RegexpQueryBuilder maxDeterminizedStates(int value) {
this.maxDeterminizedStates = value;
this.maxDetermizedStatesSet = true;
return this;
}
public RegexpQueryBuilder rewrite(String rewrite) {
this.rewrite = rewrite;
return this;
@ -95,6 +107,9 @@ public class RegexpQueryBuilder extends BaseQueryBuilder implements BoostableQue
if (flags != -1) {
builder.field("flags_value", flags);
}
if (maxDetermizedStatesSet) {
builder.field("max_determinized_states", maxDeterminizedStates);
}
if (boost != -1) {
builder.field("boost", boost);
}
@ -108,4 +123,4 @@ public class RegexpQueryBuilder extends BaseQueryBuilder implements BoostableQue
}
builder.endObject();
}
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.BytesRefs;
@ -64,6 +65,7 @@ public class RegexpQueryParser implements QueryParser {
Object value = null;
float boost = 1.0f;
int flagsValue = -1;
int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
String queryName = null;
token = parser.nextToken();
if (token == XContentParser.Token.START_OBJECT) {
@ -86,6 +88,8 @@ public class RegexpQueryParser implements QueryParser {
if (flagsValue < 0) {
flagsValue = RegExp.ALL;
}
} else if ("maxDeterminizedStates".equals(currentFieldName)) {
maxDeterminizedStates = parser.intValue();
} else if ("_name".equals(currentFieldName)) {
queryName = parser.text();
}
@ -111,16 +115,16 @@ public class RegexpQueryParser implements QueryParser {
if (smartNameFieldMappers.explicitTypeInNameWithDocMapper()) {
String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{smartNameFieldMappers.docMapper().type()});
try {
query = smartNameFieldMappers.mapper().regexpQuery(value, flagsValue, method, parseContext);
query = smartNameFieldMappers.mapper().regexpQuery(value, flagsValue, maxDeterminizedStates, method, parseContext);
} finally {
QueryParseContext.setTypes(previousTypes);
}
} else {
query = smartNameFieldMappers.mapper().regexpQuery(value, flagsValue, method, parseContext);
query = smartNameFieldMappers.mapper().regexpQuery(value, flagsValue, maxDeterminizedStates, method, parseContext);
}
}
if (query == null) {
RegexpQuery regexpQuery = new RegexpQuery(new Term(fieldName, BytesRefs.toBytesRef(value)), flagsValue);
RegexpQuery regexpQuery = new RegexpQuery(new Term(fieldName, BytesRefs.toBytesRef(value)), flagsValue, maxDeterminizedStates);
if (method != null) {
regexpQuery.setRewriteMethod(method);
}
@ -135,4 +139,4 @@ public class RegexpQueryParser implements QueryParser {
}
}
}

View File

@ -20,7 +20,6 @@
package org.elasticsearch.index.translog.fs;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.io.FileSystemUtils;
import java.io.File;
import java.io.FileNotFoundException;
@ -78,7 +77,7 @@ public class RafReference {
// ignore
} finally {
if (delete) {
FileSystemUtils.deleteFilesIgnoringExceptions(file.toPath());
IOUtils.deleteFilesIgnoringExceptions(file.toPath());
}
}

View File

@ -22,6 +22,7 @@ package org.elasticsearch.search.suggest.context;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.util.automaton.Automata;
@ -259,7 +260,10 @@ public abstract class ContextMapping implements ToXContent {
for (ContextQuery query : queries) {
a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
}
return Operations.determinize(a);
// TODO: should we limit this? Do any of our ContextQuery impls really create exponential regexps? GeoQuery looks safe (union
// of strings).
return Operations.determinize(a, Integer.MAX_VALUE);
}
/**

View File

@ -20,7 +20,6 @@ package org.elasticsearch.benchmark.fs;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.unit.ByteSizeValue;
import java.io.File;
@ -36,7 +35,7 @@ import java.util.Random;
public class FsAppendBenchmark {
public static void main(String[] args) throws Exception {
FileSystemUtils.deleteFilesIgnoringExceptions(Paths.get("work/test.log"));
IOUtils.deleteFilesIgnoringExceptions(Paths.get("work/test.log"));
RandomAccessFile raf = new RandomAccessFile("work/test.log", "rw");
raf.setLength(0);

View File

@ -19,9 +19,9 @@
package org.elasticsearch.index.query;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
@ -66,6 +66,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
@ -344,6 +345,29 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
}
}
// Verifies that a query_string query containing a /regexp/ plus an explicit
// max_determinized_states (see query-regexp-max-determinized-states.json) still
// parses into a plain RegexpQuery when the limit is not exceeded.
@Test
public void testQueryStringRegexp() throws Exception {
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/query-regexp-max-determinized-states.json");
Query parsedQuery = queryParser.parse(query).query();
assertThat(parsedQuery, instanceOf(RegexpQuery.class));
RegexpQuery regexpQuery = (RegexpQuery) parsedQuery;
// The regexp should survive into the query's string representation.
assertTrue(regexpQuery.toString().contains("/foo*bar/"));
}
// Verifies that a regexp requiring more determinized states than the configured
// limit (fixture: query-regexp-too-many-determinized-states.json) is rejected at
// parse time with a QueryParsingException caused by TooComplexToDeterminizeException,
// rather than consuming excessive RAM/CPU.
@Test
public void testQueryStringRegexpTooManyDeterminizedStates() throws Exception {
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/query-regexp-too-many-determinized-states.json");
try {
queryParser.parse(query).query();
fail("did not hit exception");
} catch (QueryParsingException qpe) {
// expected: the root cause must be Lucene's determinize-limit exception
assertTrue(qpe.getCause() instanceof TooComplexToDeterminizeException);
}
}
@Test
public void testMatchAllBuilder() throws Exception {
IndexQueryParserService queryParser = queryParser();
@ -674,6 +698,16 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
assertThat(regexpQuery.getField(), equalTo("name.first"));
}
// Verifies that a regexp query carrying max_determinized_states (fixture:
// regexp-max-determinized-states.json) parses into a RegexpQuery on the
// expected field when the limit is high enough.
@Test
public void testRegexpQueryWithMaxDeterminizedStates() throws IOException {
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/regexp-max-determinized-states.json");
Query parsedQuery = queryParser.parse(query).query();
assertThat(parsedQuery, instanceOf(RegexpQuery.class));
RegexpQuery regexpQuery = (RegexpQuery) parsedQuery;
assertThat(regexpQuery.getField(), equalTo("name.first"));
}
@Test
public void testRegexpFilteredQuery() throws IOException {
IndexQueryParserService queryParser = queryParser();
@ -687,6 +721,19 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
assertThat(regexpFilter.regexp(), equalTo("s.*y"));
}
// Verifies that a filtered query whose regexp filter carries
// max_determinized_states (fixture: regexp-filter-max-determinized-states.json)
// parses into a RegexpFilter with the expected field and pattern.
@Test
public void testRegexpFilteredQueryWithMaxDeterminizedStates() throws IOException {
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/regexp-filter-max-determinized-states.json");
Query parsedQuery = queryParser.parse(query).query();
assertThat(parsedQuery, instanceOf(FilteredQuery.class));
Filter filter = ((FilteredQuery) parsedQuery).getFilter();
assertThat(filter, instanceOf(RegexpFilter.class));
RegexpFilter regexpFilter = (RegexpFilter) filter;
assertThat(regexpFilter.field(), equalTo("name.first"));
assertThat(regexpFilter.regexp(), equalTo("s.*y"));
}
@Test
public void testNamedRegexpFilteredQuery() throws IOException {
IndexQueryParserService queryParser = queryParser();

View File

@ -0,0 +1,7 @@
{
query_string: {
default_field: "content",
query:"/foo*bar/",
max_determinized_states: 5000
}
}

View File

@ -0,0 +1,6 @@
{
query_string: {
default_field: "content",
query: "/[ac]*a[ac]{50,200}/"
}
}

View File

@ -0,0 +1,17 @@
{
"filtered": {
"query": {
"term": {
"name.first": "shay"
}
},
"filter": {
"regexp": {
"name.first": {
"value": "s.*y",
"max_determinized_states": 6000
}
}
}
}
}

View File

@ -0,0 +1,6 @@
{
"regexp": {
"name.first": "s.*y",
"max_determinized_states": 5000
}
}

View File

@ -46,7 +46,6 @@ import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.cluster.metadata.SnapshotMetaData;
import org.elasticsearch.cluster.routing.allocation.decider.FilterAllocationDecider;
import org.elasticsearch.common.collect.ImmutableOpenMap;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.store.support.AbstractIndexStore;
@ -735,8 +734,8 @@ public class SharedClusterSnapshotRestoreTests extends AbstractSnapshotTests {
File testIndex1 = new File(indices, "test-idx-1");
File testIndex2 = new File(indices, "test-idx-2");
File testIndex2Shard0 = new File(testIndex2, "0");
FileSystemUtils.deleteFilesIgnoringExceptions(new File(testIndex1, "snapshot-test-snap-1").toPath());
FileSystemUtils.deleteFilesIgnoringExceptions(new File(testIndex2Shard0, "snapshot-test-snap-1").toPath());
IOUtils.deleteFilesIgnoringExceptions(new File(testIndex1, "snapshot-test-snap-1").toPath());
IOUtils.deleteFilesIgnoringExceptions(new File(testIndex2Shard0, "snapshot-test-snap-1").toPath());
logger.info("--> delete snapshot");
client.admin().cluster().prepareDeleteSnapshot("test-repo", "test-snap-1").get();