From 8b56441d2b00bb80f3e8452f7886b295abce0d86 Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 25 Aug 2020 17:18:59 +0100 Subject: [PATCH] Search - add case insensitive support for regex queries. (#59441) (#61532) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport to add case insensitive support for regex queries. Forks a copy of Lucene’s RegexpQuery and RegExp from Lucene master. This can be removed when 8.7 Lucene is released. Closes #59235 --- .../reference/query-dsl/regexp-query.asciidoc | 5 + .../ICUCollationKeywordFieldMapper.java | 2 +- .../index/mapper/CollationFieldTypeTests.java | 2 +- server/build.gradle | 5 +- .../org/apache/lucene/search/RegExp87.java | 1074 +++++++++++++++++ .../apache/lucene/search/RegexpQuery87.java | 143 +++ .../index/mapper/MappedFieldType.java | 4 +- .../index/mapper/StringFieldType.java | 7 +- .../elasticsearch/index/query/RegexpFlag.java | 20 +- .../index/query/RegexpQueryBuilder.java | 71 +- .../index/search/QueryStringQueryParser.java | 4 +- .../suggest/completion/RegexOptions.java | 4 +- .../mapper/CompletionFieldMapperTests.java | 4 +- .../index/mapper/IgnoredFieldTypeTests.java | 8 +- .../index/mapper/IndexFieldTypeTests.java | 2 +- .../index/mapper/KeywordFieldTypeTests.java | 10 +- .../index/mapper/RoutingFieldTypeTests.java | 8 +- .../index/mapper/TextFieldTypeTests.java | 10 +- .../query/QueryStringQueryBuilderTests.java | 6 +- .../index/query/RegexpQueryBuilderTests.java | 35 +- .../mapper/ConstantKeywordFieldMapper.java | 6 +- .../mapper/ConstantKeywordFieldTypeTests.java | 8 +- .../mapper/FlatObjectFieldMapper.java | 2 +- .../mapper/KeyedFlatObjectFieldTypeTests.java | 2 +- .../mapper/RootFlatObjectFieldTypeTests.java | 8 +- .../test/wildcard/10_wildcard_basic.yml | 17 + .../wildcard/mapper/WildcardFieldMapper.java | 19 +- .../mapper/WildcardFieldMapperTests.java | 21 +- 28 files changed, 1411 insertions(+), 96 deletions(-) create mode 100644 
server/src/main/java/org/apache/lucene/search/RegExp87.java create mode 100644 server/src/main/java/org/apache/lucene/search/RegexpQuery87.java diff --git a/docs/reference/query-dsl/regexp-query.asciidoc b/docs/reference/query-dsl/regexp-query.asciidoc index 5b9e4ea8a22..a0ead3ea9df 100644 --- a/docs/reference/query-dsl/regexp-query.asciidoc +++ b/docs/reference/query-dsl/regexp-query.asciidoc @@ -28,6 +28,7 @@ GET /_search "user.id": { "value": "k.*y", "flags": "ALL", + "case_insensitive": true, "max_determinized_states": 10000, "rewrite": "constant_score" } @@ -67,6 +68,10 @@ provided. To improve performance, avoid using wildcard patterns, such as `.*` or valid values and more information, see <>. +`case_insensitive`:: +(Optional, boolean) allows case insensitive matching of the regular expression +value with the indexed field values when set to true. Setting to false is disallowed. + `max_determinized_states`:: + -- diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java index c53502deaf3..e3c05781a0e 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java @@ -146,7 +146,7 @@ public class ICUCollationKeywordFieldMapper extends FieldMapper { } @Override - public Query regexpQuery(String value, int flags, int maxDeterminizedStates, + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, MultiTermQuery.RewriteMethod method, QueryShardContext context) { throw new UnsupportedOperationException("[regexp] queries are not supported on [" + CONTENT_TYPE + "] fields."); } diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java 
b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java index c1c86bbb8f8..f5f225950fe 100644 --- a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java @@ -91,7 +91,7 @@ public class CollationFieldTypeTests extends FieldTypeTestCase{ public void testRegexpQuery() { MappedFieldType ft = new CollationFieldType("field", DEFAULT_COLLATOR); UnsupportedOperationException e = expectThrows(UnsupportedOperationException.class, - () -> ft.regexpQuery("foo.*", 0, 10, null, randomMockShardContext())); + () -> ft.regexpQuery("foo.*", 0, 0, 10, null, randomMockShardContext())); assertEquals("[regexp] queries are not supported on [icu_collation_keyword] fields.", e.getMessage()); } diff --git a/server/build.gradle b/server/build.gradle index 59c5704fae7..48ff7814b9e 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -333,5 +333,8 @@ tasks.named("dependencyLicenses").configure { tasks.named("licenseHeaders").configure { // Ignore our vendored version of Google Guice excludes << 'org/elasticsearch/common/inject/**/*' + // Ignore temporary copies of impending 8.7 Lucene classes + excludes << 'org/apache/lucene/search/RegExp87*' + excludes << 'org/apache/lucene/search/RegexpQuery87*' excludes << 'org/elasticsearch/client/documentation/placeholder.txt' -} \ No newline at end of file +} diff --git a/server/src/main/java/org/apache/lucene/search/RegExp87.java b/server/src/main/java/org/apache/lucene/search/RegExp87.java new file mode 100644 index 00000000000..00d77efbacf --- /dev/null +++ b/server/src/main/java/org/apache/lucene/search/RegExp87.java @@ -0,0 +1,1074 @@ +/* + * dk.brics.automaton + * + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package org.apache.lucene.search; + +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +/** + * Copy of Lucene 8.7's forthcoming RegExp class brought forward for + * case insensitive search feature. + * + * @deprecated Use the RegExp object coming in Lucene 8.7 when it ships + */ +@Deprecated +public class RegExp87 { + + /** + * The type of expression represented by a RegExp node. + */ + public enum Kind { + /** The union of two expressions */ + REGEXP_UNION, + /** A sequence of two expressions */ + REGEXP_CONCATENATION, + /** The intersection of two expressions */ + REGEXP_INTERSECTION, + /** An optional expression */ + REGEXP_OPTIONAL, + /** An expression that repeats */ + REGEXP_REPEAT, + /** An expression that repeats a minimum number of times*/ + REGEXP_REPEAT_MIN, + /** An expression that repeats a minimum and maximum number of times*/ + REGEXP_REPEAT_MINMAX, + /** The complement of an expression */ + REGEXP_COMPLEMENT, + /** A Character */ + REGEXP_CHAR, + /** A Character range*/ + REGEXP_CHAR_RANGE, + /** Any Character allowed*/ + REGEXP_ANYCHAR, + /** An empty expression*/ + REGEXP_EMPTY, + /** A string expression*/ + REGEXP_STRING, + /** Any string allowed */ + REGEXP_ANYSTRING, + /** An Automaton expression*/ + REGEXP_AUTOMATON, + /** An Interval expression */ + REGEXP_INTERVAL, + /** An expression for a pre-defined class e.g. 
\w */ + REGEXP_PRE_CLASS + } + + //----- Syntax flags ( <= 0xff ) ------ + /** + * Syntax flag, enables intersection (&). + */ + public static final int INTERSECTION = 0x0001; + + /** + * Syntax flag, enables complement (~). + */ + public static final int COMPLEMENT = 0x0002; + + /** + * Syntax flag, enables empty language (#). + */ + public static final int EMPTY = 0x0004; + + /** + * Syntax flag, enables anystring (@). + */ + public static final int ANYSTRING = 0x0008; + + /** + * Syntax flag, enables named automata (<identifier>). + */ + public static final int AUTOMATON = 0x0010; + + /** + * Syntax flag, enables numerical intervals ( + * <n-m>). + */ + public static final int INTERVAL = 0x0020; + + /** + * Syntax flag, enables all optional regexp syntax. + */ + public static final int ALL = 0xff; + + /** + * Syntax flag, enables no optional regexp syntax. + */ + public static final int NONE = 0x0000; + + //----- Matching flags ( > 0xff ) ------ + + /** + * Allows case insensitive matching of ASCII characters. + */ + public static final int ASCII_CASE_INSENSITIVE = 0x0100; + + //Immutable parsed state + /** + * The type of expression + */ + public final Kind kind; + /** + * Child expressions held by a container type expression + */ + public final RegExp87 exp1, exp2; + /** + * String expression + */ + public final String s; + /** + * Character expression + */ + public final int c; + /** + * Limits for repeatable type expressions + */ + public final int min, max, digits; + /** + * Extents for range type expressions + */ + public final int from, to; + + // Parser variables + private final String originalString; + final int flags; + int pos; + + /** + * Constructs new RegExp from a string. Same as + * RegExp(s, ALL). 
+ * + * @param s regexp string + * @exception IllegalArgumentException if an error occurred while parsing the + * regular expression + */ + public RegExp87(String s) throws IllegalArgumentException { + this(s, ALL); + } + + /** + * Constructs new RegExp from a string. + * + * @param s regexp string + * @param syntax_flags boolean 'or' of optional syntax constructs to be + * enabled + * @exception IllegalArgumentException if an error occurred while parsing the + * regular expression + */ + public RegExp87(String s, int syntax_flags) throws IllegalArgumentException { + this(s, syntax_flags, 0); + } + /** + * Constructs new RegExp from a string. + * + * @param s regexp string + * @param syntax_flags boolean 'or' of optional syntax constructs to be + * enabled + * @param match_flags boolean 'or' of match behavior options such as case insensitivity + * @exception IllegalArgumentException if an error occurred while parsing the + * regular expression + */ + public RegExp87(String s, int syntax_flags, int match_flags) throws IllegalArgumentException { + if (syntax_flags > ALL) { + throw new IllegalArgumentException("Illegal syntax flag"); + } + + if (match_flags > 0 && match_flags <= ALL) { + throw new IllegalArgumentException("Illegal match flag"); + } + flags = syntax_flags | match_flags; + originalString = s; + RegExp87 e; + if (s.length() == 0) e = makeString(flags, ""); + else { + e = parseUnionExp(); + if (pos < originalString.length()) throw new IllegalArgumentException( + "end-of-string expected at position " + pos); + } + kind = e.kind; + exp1 = e.exp1; + exp2 = e.exp2; + this.s = e.s; + c = e.c; + min = e.min; + max = e.max; + digits = e.digits; + from = e.from; + to = e.to; + } + + RegExp87(int flags, Kind kind, RegExp87 exp1, RegExp87 exp2, String s, int c, int min, int max, int digits, int from, int to){ + this.originalString = null; + this.kind = kind; + this.flags = flags; + this.exp1 = exp1; + this.exp2 = exp2; + this.s = s; + this.c = c; + this.min = min; 
+ this.max = max; + this.digits = digits; + this.from = from; + this.to = to; + } + + // Simplified construction of container nodes + static RegExp87 newContainerNode(int flags, Kind kind, RegExp87 exp1, RegExp87 exp2) { + return new RegExp87(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0); + } + + // Simplified construction of repeating nodes + static RegExp87 newRepeatingNode(int flags, Kind kind, RegExp87 exp, int min, int max) { + return new RegExp87(flags, kind, exp, null, null, 0, min, max, 0, 0, 0); + } + + + // Simplified construction of leaf nodes + static RegExp87 newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) { + return new RegExp87(flags, kind, null, null, s, c, min, max, digits, from, to); + } + + /** + * Constructs new Automaton from this RegExp. Same + * as toAutomaton(null) (empty automaton map). + */ + public Automaton toAutomaton() { + return toAutomaton(null, null, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + /** + * Constructs new Automaton from this RegExp. The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. + * + * @param maxDeterminizedStates maximum number of states in the resulting + * automata. If the automata would need more than this many states + * TooComplexToDeterminizeException is thrown. Higher numbers require more + * space but can process more complex regexes. + * @exception IllegalArgumentException if this regular expression uses a named + * identifier that is not available from the automaton provider + * @exception TooComplexToDeterminizeException if determinizing this regexp + * requires more than maxDeterminizedStates states + */ + public Automaton toAutomaton(int maxDeterminizedStates) + throws IllegalArgumentException, TooComplexToDeterminizeException { + return toAutomaton(null, null, maxDeterminizedStates); + } + + /** + * Constructs new Automaton from this RegExp.
The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. + * + * @param automaton_provider provider of automata for named identifiers + * @param maxDeterminizedStates maximum number of states in the resulting + * automata. If the automata would need more than this many states + * TooComplexToDeterminizeException is thrown. Higher numbers require more + * space but can process more complex regexes. + * @exception IllegalArgumentException if this regular expression uses a named + * identifier that is not available from the automaton provider + * @exception TooComplexToDeterminizeException if determinizing this regexp + * requires more than maxDeterminizedStates states + */ + public Automaton toAutomaton(AutomatonProvider automaton_provider, + int maxDeterminizedStates) throws IllegalArgumentException, + TooComplexToDeterminizeException { + return toAutomaton(null, automaton_provider, maxDeterminizedStates); + } + + /** + * Constructs new Automaton from this RegExp. The + * constructed automaton is minimal and deterministic and has no transitions + * to dead states. + * + * @param automata a map from automaton identifiers to automata (of type + * Automaton). + * @param maxDeterminizedStates maximum number of states in the resulting + * automata. If the automata would need more than this many states + * TooComplexToDeterminizeException is thrown. Higher numbers require more + * space but can process more complex regexes.
+ * @exception IllegalArgumentException if this regular expression uses a named + * identifier that does not occur in the automaton map + * @exception TooComplexToDeterminizeException if determinizing this regexp + * requires more than maxDeterminizedStates states + */ + public Automaton toAutomaton(Map automata, + int maxDeterminizedStates) throws IllegalArgumentException, + TooComplexToDeterminizeException { + return toAutomaton(automata, null, maxDeterminizedStates); + } + + private Automaton toAutomaton(Map automata, + AutomatonProvider automaton_provider, int maxDeterminizedStates) + throws IllegalArgumentException, TooComplexToDeterminizeException { + try { + return toAutomatonInternal(automata, automaton_provider, + maxDeterminizedStates); + } catch (TooComplexToDeterminizeException e) { + // This is a little ugly. Have to pass an instance of core Lucene RegExp just to get error message. + throw new TooComplexToDeterminizeException(new RegExp(this.originalString), e); + } + } + + private Automaton toAutomatonInternal(Map automata, + AutomatonProvider automaton_provider, int maxDeterminizedStates) + throws IllegalArgumentException { + List list; + Automaton a = null; + switch (kind) { + case REGEXP_PRE_CLASS: + RegExp87 expanded = expandPredefined(); + a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates); + break; + case REGEXP_UNION: + list = new ArrayList<>(); + findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider, + maxDeterminizedStates); + findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider, + maxDeterminizedStates); + a = Operations.union(list); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_CONCATENATION: + list = new ArrayList<>(); + findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider, maxDeterminizedStates); + findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, + automaton_provider, 
maxDeterminizedStates); + a = Operations.concatenate(list); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_INTERSECTION: + a = Operations.intersection( + exp1.toAutomatonInternal( + automata, automaton_provider, maxDeterminizedStates), + exp2.toAutomatonInternal( + automata, automaton_provider, maxDeterminizedStates)); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_OPTIONAL: + a = Operations.optional(exp1.toAutomatonInternal(automata, + automaton_provider, maxDeterminizedStates)); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_REPEAT: + a = Operations.repeat(exp1.toAutomatonInternal( + automata, automaton_provider, maxDeterminizedStates)); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_REPEAT_MIN: + a = exp1.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates); + int minNumStates = (a.getNumStates() - 1) * min; + if (minNumStates > maxDeterminizedStates) { + throw new TooComplexToDeterminizeException(a, minNumStates); + } + a = Operations.repeat(a, min); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_REPEAT_MINMAX: + a = exp1.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates); + int minMaxNumStates = (a.getNumStates() - 1) * max; + if (minMaxNumStates > maxDeterminizedStates) { + throw new TooComplexToDeterminizeException(a, minMaxNumStates); + } + a = Operations.repeat(a, min, max); + break; + case REGEXP_COMPLEMENT: + a = Operations.complement( + exp1.toAutomatonInternal(automata, automaton_provider, + maxDeterminizedStates), + maxDeterminizedStates); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + break; + case REGEXP_CHAR: + if (check(ASCII_CASE_INSENSITIVE)) { + a = toCaseInsensitiveChar(c, maxDeterminizedStates); + } else { + a = Automata.makeChar(c); + } + break; + case REGEXP_CHAR_RANGE: + a = 
Automata.makeCharRange(from, to); + break; + case REGEXP_ANYCHAR: + a = Automata.makeAnyChar(); + break; + case REGEXP_EMPTY: + a = Automata.makeEmpty(); + break; + case REGEXP_STRING: + if (check(ASCII_CASE_INSENSITIVE)) { + a = toCaseInsensitiveString(maxDeterminizedStates); + } else { + a = Automata.makeString(s); + } + break; + case REGEXP_ANYSTRING: + a = Automata.makeAnyString(); + break; + case REGEXP_AUTOMATON: + Automaton aa = null; + if (automata != null) { + aa = automata.get(s); + } + if (aa == null && automaton_provider != null) { + try { + aa = automaton_provider.getAutomaton(s); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + if (aa == null) { + throw new IllegalArgumentException("'" + s + "' not found"); + } + a = aa; + break; + case REGEXP_INTERVAL: + a = Automata.makeDecimalInterval(min, max, digits); + break; + } + return a; + } + private Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) { + Automaton case1 = Automata.makeChar(codepoint); + // For now we only work with ASCII characters + if (codepoint > 128) { + return case1; + } + int altCase = Character.isLowerCase(codepoint) ? 
Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint); + Automaton result; + if (altCase != codepoint) { + result = Operations.union(case1, Automata.makeChar(altCase)); + result = MinimizationOperations.minimize(result, maxDeterminizedStates); + } else { + result = case1; + } + return result; + } + + private Automaton toCaseInsensitiveString(int maxDeterminizedStates) { + List list = new ArrayList<>(); + + Iterator iter = s.codePoints().iterator(); + while (iter.hasNext()) { + list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates)); + } + Automaton a = Operations.concatenate(list); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + return a; + } + + private void findLeaves(RegExp87 exp, Kind kind, List list, + Map automata, AutomatonProvider automaton_provider, + int maxDeterminizedStates) { + if (exp.kind == kind) { + findLeaves(exp.exp1, kind, list, automata, automaton_provider, + maxDeterminizedStates); + findLeaves(exp.exp2, kind, list, automata, automaton_provider, + maxDeterminizedStates); + } else { + list.add(exp.toAutomatonInternal(automata, automaton_provider, + maxDeterminizedStates)); + } + } + + /** + * The string that was used to construct the regex. Compare to toString. + */ + public String getOriginalString() { + return originalString; + } + + /** + * Constructs string from parsed regular expression. 
+ */ + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + toStringBuilder(b); + return b.toString(); + } + + void toStringBuilder(StringBuilder b) { + switch (kind) { + case REGEXP_UNION: + b.append("("); + exp1.toStringBuilder(b); + b.append("|"); + exp2.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_CONCATENATION: + exp1.toStringBuilder(b); + exp2.toStringBuilder(b); + break; + case REGEXP_INTERSECTION: + b.append("("); + exp1.toStringBuilder(b); + b.append("&"); + exp2.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_OPTIONAL: + b.append("("); + exp1.toStringBuilder(b); + b.append(")?"); + break; + case REGEXP_REPEAT: + b.append("("); + exp1.toStringBuilder(b); + b.append(")*"); + break; + case REGEXP_REPEAT_MIN: + b.append("("); + exp1.toStringBuilder(b); + b.append("){").append(min).append(",}"); + break; + case REGEXP_REPEAT_MINMAX: + b.append("("); + exp1.toStringBuilder(b); + b.append("){").append(min).append(",").append(max).append("}"); + break; + case REGEXP_COMPLEMENT: + b.append("~("); + exp1.toStringBuilder(b); + b.append(")"); + break; + case REGEXP_CHAR: + b.append("\\").appendCodePoint(c); + break; + case REGEXP_CHAR_RANGE: + b.append("[\\").appendCodePoint(from).append("-\\").appendCodePoint(to).append("]"); + break; + case REGEXP_ANYCHAR: + b.append("."); + break; + case REGEXP_EMPTY: + b.append("#"); + break; + case REGEXP_STRING: + b.append("\"").append(s).append("\""); + break; + case REGEXP_ANYSTRING: + b.append("@"); + break; + case REGEXP_AUTOMATON: + b.append("<").append(s).append(">"); + break; + case REGEXP_INTERVAL: + String s1 = Integer.toString(min); + String s2 = Integer.toString(max); + b.append("<"); + if (digits > 0) for (int i = s1.length(); i < digits; i++) + b.append('0'); + b.append(s1).append("-"); + if (digits > 0) for (int i = s2.length(); i < digits; i++) + b.append('0'); + b.append(s2).append(">"); + break; + case REGEXP_PRE_CLASS: + 
b.append("\\").appendCodePoint(from); + break; + } + } + + /** + * Like toString, but more verbose (shows the hierarchy more clearly). + */ + public String toStringTree() { + StringBuilder b = new StringBuilder(); + toStringTree(b, ""); + return b.toString(); + } + + void toStringTree(StringBuilder b, String indent) { + switch (kind) { + // binary + case REGEXP_UNION: + case REGEXP_CONCATENATION: + case REGEXP_INTERSECTION: + b.append(indent); + b.append(kind); + b.append('\n'); + exp1.toStringTree(b, indent + " "); + exp2.toStringTree(b, indent + " "); + break; + // unary + case REGEXP_OPTIONAL: + case REGEXP_REPEAT: + case REGEXP_COMPLEMENT: + b.append(indent); + b.append(kind); + b.append('\n'); + exp1.toStringTree(b, indent + " "); + break; + case REGEXP_REPEAT_MIN: + b.append(indent); + b.append(kind); + b.append(" min="); + b.append(min); + b.append('\n'); + exp1.toStringTree(b, indent + " "); + break; + case REGEXP_REPEAT_MINMAX: + b.append(indent); + b.append(kind); + b.append(" min="); + b.append(min); + b.append(" max="); + b.append(max); + b.append('\n'); + exp1.toStringTree(b, indent + " "); + break; + case REGEXP_CHAR: + b.append(indent); + b.append(kind); + b.append(" char="); + b.appendCodePoint(c); + b.append('\n'); + break; + case REGEXP_PRE_CLASS: + b.append(indent); + b.append(kind); + b.append(" class=\\"); + b.appendCodePoint(from); + b.append('\n'); + break; + case REGEXP_CHAR_RANGE: + b.append(indent); + b.append(kind); + b.append(" from="); + b.appendCodePoint(from); + b.append(" to="); + b.appendCodePoint(to); + b.append('\n'); + break; + case REGEXP_ANYCHAR: + case REGEXP_EMPTY: + b.append(indent); + b.append(kind); + b.append('\n'); + break; + case REGEXP_STRING: + b.append(indent); + b.append(kind); + b.append(" string="); + b.append(s); + b.append('\n'); + break; + case REGEXP_ANYSTRING: + b.append(indent); + b.append(kind); + b.append('\n'); + break; + case REGEXP_AUTOMATON: + b.append(indent); + b.append(kind); + b.append('\n'); +
break; + case REGEXP_INTERVAL: + b.append(indent); + b.append(kind); + String s1 = Integer.toString(min); + String s2 = Integer.toString(max); + b.append("<"); + if (digits > 0) for (int i = s1.length(); i < digits; i++) + b.append('0'); + b.append(s1).append("-"); + if (digits > 0) for (int i = s2.length(); i < digits; i++) + b.append('0'); + b.append(s2).append(">"); + b.append('\n'); + break; + } + } + + /** + * Returns set of automaton identifiers that occur in this regular expression. + */ + public Set getIdentifiers() { + HashSet set = new HashSet<>(); + getIdentifiers(set); + return set; + } + + void getIdentifiers(Set set) { + switch (kind) { + case REGEXP_UNION: + case REGEXP_CONCATENATION: + case REGEXP_INTERSECTION: + exp1.getIdentifiers(set); + exp2.getIdentifiers(set); + break; + case REGEXP_OPTIONAL: + case REGEXP_REPEAT: + case REGEXP_REPEAT_MIN: + case REGEXP_REPEAT_MINMAX: + case REGEXP_COMPLEMENT: + exp1.getIdentifiers(set); + break; + case REGEXP_AUTOMATON: + set.add(s); + break; + default: + } + } + + static RegExp87 makeUnion(int flags, RegExp87 exp1, RegExp87 exp2) { + return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2); + } + + static RegExp87 makeConcatenation(int flags, RegExp87 exp1, RegExp87 exp2) { + if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString( + flags, exp1, exp2); + RegExp87 rexp1, rexp2; + if (exp1.kind == Kind.REGEXP_CONCATENATION + && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING) + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) { + rexp1 = exp1.exp1; + rexp2 = makeString(flags, exp1.exp2, exp2); + } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) + && exp2.kind == Kind.REGEXP_CONCATENATION + && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) { + rexp1 = makeString(flags, exp1, exp2.exp1); + rexp2 
= exp2.exp2; + } else { + rexp1 = exp1; + rexp2 = exp2; + } + return newContainerNode(flags, Kind.REGEXP_CONCATENATION, rexp1, rexp2); + } + + private static RegExp87 makeString(int flags, RegExp87 exp1, RegExp87 exp2) { + StringBuilder b = new StringBuilder(); + if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s); + else b.appendCodePoint(exp1.c); + if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s); + else b.appendCodePoint(exp2.c); + return makeString(flags, b.toString()); + } + + static RegExp87 makeIntersection(int flags, RegExp87 exp1, RegExp87 exp2) { + return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2); + } + + static RegExp87 makeOptional(int flags, RegExp87 exp) { + return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null); + } + + static RegExp87 makeRepeat(int flags, RegExp87 exp) { + return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null); + } + + static RegExp87 makeRepeat(int flags, RegExp87 exp, int min) { + return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0); + } + + static RegExp87 makeRepeat(int flags, RegExp87 exp, int min, int max) { + return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max); + } + + static RegExp87 makeComplement(int flags, RegExp87 exp) { + return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null); + } + + static RegExp87 makeChar(int flags, int c) { + return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0); + } + + static RegExp87 makeCharRange(int flags, int from, int to) { + if (from > to) + throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")"); + return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to); + } + + static RegExp87 makeAnyChar(int flags) { + return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null); + } + + static RegExp87 makeEmpty(int flags) { + return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null); + } + + static RegExp87 makeString(int 
flags, String s) { + return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0); + } + + static RegExp87 makeAnyString(int flags) { + return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null); + } + + static RegExp87 makeAutomaton(int flags, String s) { + return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0); + } + + static RegExp87 makeInterval(int flags, int min, int max, int digits) { + return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0); + } + + private boolean peek(String s) { + return more() && s.indexOf(originalString.codePointAt(pos)) != -1; + } + + private boolean match(int c) { + if (pos >= originalString.length()) return false; + if (originalString.codePointAt(pos) == c) { + pos += Character.charCount(c); + return true; + } + return false; + } + + private boolean more() { + return pos < originalString.length(); + } + + private int next() throws IllegalArgumentException { + if (!more()) throw new IllegalArgumentException("unexpected end-of-string"); + int ch = originalString.codePointAt(pos); + pos += Character.charCount(ch); + return ch; + } + + private boolean check(int flag) { + return (flags & flag) != 0; + } + + final RegExp87 parseUnionExp() throws IllegalArgumentException { + RegExp87 e = parseInterExp(); + if (match('|')) e = makeUnion(flags, e, parseUnionExp()); + return e; + } + + final RegExp87 parseInterExp() throws IllegalArgumentException { + RegExp87 e = parseConcatExp(); + if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e, + parseInterExp()); + return e; + } + + final RegExp87 parseConcatExp() throws IllegalArgumentException { + RegExp87 e = parseRepeatExp(); + if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation( + flags, e, parseConcatExp()); + return e; + } + + final RegExp87 parseRepeatExp() throws IllegalArgumentException { + RegExp87 e = parseComplExp(); + while (peek("?*+{")) { + if (match('?')) e = makeOptional(flags, 
e); + else if (match('*')) e = makeRepeat(flags, e); + else if (match('+')) e = makeRepeat(flags, e, 1); + else if (match('{')) { + int start = pos; + while (peek("0123456789")) + next(); + if (start == pos) throw new IllegalArgumentException( + "integer expected at position " + pos); + int n = Integer.parseInt(originalString.substring(start, pos)); + int m = -1; + if (match(',')) { + start = pos; + while (peek("0123456789")) + next(); + if (start != pos) m = Integer.parseInt( + originalString.substring(start, pos)); + } else m = n; + if (!match('}')) throw new IllegalArgumentException( + "expected '}' at position " + pos); + if (m == -1) e = makeRepeat(flags, e, n); + else e = makeRepeat(flags, e, n, m); + } + } + return e; + } + + final RegExp87 parseComplExp() throws IllegalArgumentException { + if (check(COMPLEMENT) && match('~')) return makeComplement(flags, parseComplExp()); + else return parseCharClassExp(); + } + + final RegExp87 parseCharClassExp() throws IllegalArgumentException { + if (match('[')) { + boolean negate = false; + if (match('^')) negate = true; + RegExp87 e = parseCharClasses(); + if (negate) e = makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, e)); + if (!match(']')) throw new IllegalArgumentException( + "expected ']' at position " + pos); + return e; + } else return parseSimpleExp(); + } + + final RegExp87 parseCharClasses() throws IllegalArgumentException { + RegExp87 e = parseCharClass(); + while (more() && !peek("]")) + e = makeUnion(flags, e, parseCharClass()); + return e; + } + + final RegExp87 parseCharClass() throws IllegalArgumentException { + RegExp87 predefinedExp = matchPredefinedCharacterClass(); + if (predefinedExp != null) { + return predefinedExp; + } + + int c = parseCharExp(); + if (match('-')) return makeCharRange(flags, c, parseCharExp()); + else return makeChar(flags, c); + } + + RegExp87 expandPredefined() { + //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html + switch 
(from) { + case 'd': + return new RegExp87("[0-9]"); // digit + case 'D': + return new RegExp87("[^0-9]"); // non-digit + case 's': + return new RegExp87("[ \t\n\r]"); // whitespace + case 'S': + return new RegExp87("[^\\s]"); // non-whitespace + case 'w': + return new RegExp87("[a-zA-Z_0-9]"); // word + case 'W': + return new RegExp87("[^\\w]"); // non-word + default: + throw new IllegalArgumentException( + "invalid character class " + from); + } + } + + + final RegExp87 matchPredefinedCharacterClass() { + //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html + if (match('\\')) { + if (peek("dDwWsS")) { + return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0); + } + + if (peek("\\")) { + return makeChar(flags, next()); + } + + // From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs + // "It is an error to use a backslash prior to any alphabetic character that does not denote an escaped + // construct;" -- next() returns a code point, so it is rendered as a character in the message + if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) { + throw new IllegalArgumentException("invalid character class \\" + new String(Character.toChars(next()))); + } + } + + return null; + } + + + final RegExp87 parseSimpleExp() throws IllegalArgumentException { + if (match('.')) return makeAnyChar(flags); + else if (check(EMPTY) && match('#')) return makeEmpty(flags); + else if (check(ANYSTRING) && match('@')) return makeAnyString(flags); + else if (match('"')) { + int start = pos; + while (more() && !peek("\"")) + next(); + if (!match('"')) throw new IllegalArgumentException( + "expected '\"' at position " + pos); + return makeString(flags, originalString.substring(start, pos - 1)); + } else if (match('(')) { + if (match(')')) return makeString(flags, ""); + RegExp87 e = parseUnionExp(); + if (!match(')')) throw new IllegalArgumentException( + "expected ')' at position " + pos); + return e; + } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) { + int start = pos; + while (more() && 
!peek(">")) + next(); + if (!match('>')) throw new IllegalArgumentException( + "expected '>' at position " + pos); + String s = originalString.substring(start, pos - 1); + int i = s.indexOf('-'); + if (i == -1) { + if (!check(AUTOMATON)) throw new IllegalArgumentException( + "interval syntax error at position " + (pos - 1)); + return makeAutomaton(flags, s); + } else { + if (!check(INTERVAL)) throw new IllegalArgumentException( + "illegal identifier at position " + (pos - 1)); + try { + if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException(); + String smin = s.substring(0, i); + String smax = s.substring(i + 1, s.length()); + int imin = Integer.parseInt(smin); + int imax = Integer.parseInt(smax); + int digits; + if (smin.length() == smax.length()) digits = smin.length(); + else digits = 0; + if (imin > imax) { + int t = imin; + imin = imax; + imax = t; + } + return makeInterval(flags, imin, imax, digits); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "interval syntax error at position " + (pos - 1)); + } + } + } else { + RegExp87 predefined = matchPredefinedCharacterClass(); + if (predefined != null) { + return predefined; + } + return makeChar(flags, parseCharExp()); + } + } + + final int parseCharExp() throws IllegalArgumentException { + match('\\'); + return next(); + } +} diff --git a/server/src/main/java/org/apache/lucene/search/RegexpQuery87.java b/server/src/main/java/org/apache/lucene/search/RegexpQuery87.java new file mode 100644 index 00000000000..8bf07e3ad6d --- /dev/null +++ b/server/src/main/java/org/apache/lucene/search/RegexpQuery87.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + + +import org.apache.lucene.index.Term; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonProvider; +import org.apache.lucene.util.automaton.Operations; + +/** + * Copy of Lucene's RegexpQuery class coming in 8.7 with case + * insensitive search option + * @deprecated temporary fork, to be removed once Lucene 8.7 is released + */ +@Deprecated +public class RegexpQuery87 extends AutomatonQuery { + /** + * A provider that provides no named automata + */ + private static AutomatonProvider defaultProvider = new AutomatonProvider() { + @Override + public Automaton getAutomaton(String name) { + return null; + } + }; + + /** + * Constructs a query for terms matching term. + *

+ * By default, all regular expression features are enabled. + *

+ * + * @param term regular expression. + */ + public RegexpQuery87(Term term) { + this(term, RegExp87.ALL); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp features from {@link RegExp87} + */ + public RegexpQuery87(Term term, int flags) { + this(term, flags, defaultProvider, + Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param flags optional RegExp syntax features from {@link RegExp87} + * @param maxDeterminizedStates maximum number of states that compiling the + * automaton for the regexp can result in. Set higher to allow more complex + * queries and lower to prevent memory exhaustion. + */ + public RegexpQuery87(Term term, int flags, int maxDeterminizedStates) { + this(term, flags, defaultProvider, maxDeterminizedStates); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param syntax_flags optional RegExp syntax features from {@link RegExp87} + * @param match_flags boolean 'or' of match behavior options such as case insensitivity + * @param maxDeterminizedStates maximum number of states that compiling the + * automaton for the regexp can result in. Set higher to allow more complex + * queries and lower to prevent memory exhaustion. + */ + public RegexpQuery87(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) { + this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param syntax_flags optional RegExp features from {@link RegExp87} + * @param provider custom AutomatonProvider for named automata + * @param maxDeterminizedStates maximum number of states that compiling the + * automaton for the regexp can result in. 
Set higher to allow more complex + * queries and lower to prevent memory exhaustion. + */ + public RegexpQuery87(Term term, int syntax_flags, AutomatonProvider provider, + int maxDeterminizedStates) { + this(term, syntax_flags, 0, provider, maxDeterminizedStates); + } + + /** + * Constructs a query for terms matching term. + * + * @param term regular expression. + * @param syntax_flags optional RegExp features from {@link RegExp87} + * @param match_flags boolean 'or' of match behavior options such as case insensitivity + * @param provider custom AutomatonProvider for named automata + * @param maxDeterminizedStates maximum number of states that compiling the + * automaton for the regexp can result in. Set higher to allow more complex + * queries and lower to prevent memory exhaustion. + */ + public RegexpQuery87(Term term, int syntax_flags, int match_flags, AutomatonProvider provider, + int maxDeterminizedStates) { + super(term, + new RegExp87(term.text(), syntax_flags, match_flags).toAutomaton( + provider, maxDeterminizedStates), maxDeterminizedStates); + } + + /** Returns the regexp of this query wrapped in a Term. */ + public Term getRegexp() { + return term; + } + + /** Prints a user-readable version of this query. 
*/ + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + if (!term.field().equals(field)) { + buffer.append(term.field()); + buffer.append(":"); + } + buffer.append('/'); + buffer.append(term.text()); + buffer.append('/'); + return buffer.toString(); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java index 1eb9fd0330c..dd538cdb012 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java @@ -214,8 +214,8 @@ public abstract class MappedFieldType { + "] which is of type [" + typeName() + "]"); } - public Query regexpQuery(String value, int flags, int maxDeterminizedStates, @Nullable MultiTermQuery.RewriteMethod method, - QueryShardContext context) { + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, + @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) { throw new QueryShardException(context, "Can only use regexp queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index c6a7104973d..b38ebfdd32a 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -25,7 +25,7 @@ import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.TermRangeQuery; import 
org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; @@ -134,14 +134,15 @@ public abstract class StringFieldType extends TermBasedFieldType { } @Override - public Query regexpQuery(String value, int flags, int maxDeterminizedStates, + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, MultiTermQuery.RewriteMethod method, QueryShardContext context) { if (context.allowExpensiveQueries() == false) { throw new ElasticsearchException("[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."); } failIfNotIndexed(); - RegexpQuery query = new RegexpQuery(new Term(name(), indexedValueForSearch(value)), flags, maxDeterminizedStates); + RegexpQuery87 query = new RegexpQuery87(new Term(name(), indexedValueForSearch(value)), syntaxFlags, + matchFlags, maxDeterminizedStates); if (method != null) { query.setRewriteMethod(method); } diff --git a/server/src/main/java/org/elasticsearch/index/query/RegexpFlag.java b/server/src/main/java/org/elasticsearch/index/query/RegexpFlag.java index 669c885276f..37388cd5cf2 100644 --- a/server/src/main/java/org/elasticsearch/index/query/RegexpFlag.java +++ b/server/src/main/java/org/elasticsearch/index/query/RegexpFlag.java @@ -18,7 +18,7 @@ */ package org.elasticsearch.index.query; -import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.search.RegExp87; import org.elasticsearch.common.Strings; import java.util.Locale; @@ -43,37 +43,37 @@ public enum RegexpFlag { /** * Enables intersection of the form: {@code <expression> & <expression>} */ - INTERSECTION(RegExp.INTERSECTION), + INTERSECTION(RegExp87.INTERSECTION), /** * Enables complement expression of the form: {@code ~<expression>} */ - COMPLEMENT(RegExp.COMPLEMENT), + COMPLEMENT(RegExp87.COMPLEMENT), /** * Enables empty language expression: {@code #} */ - EMPTY(RegExp.EMPTY), + EMPTY(RegExp87.EMPTY), /** * Enables any string expression: {@code @} */ - 
ANYSTRING(RegExp.ANYSTRING), + ANYSTRING(RegExp87.ANYSTRING), /** * Enables numerical interval expression: {@code <n-m>} */ - INTERVAL(RegExp.INTERVAL), + INTERVAL(RegExp87.INTERVAL), /** * Disables all available option flags */ - NONE(RegExp.NONE), + NONE(RegExp87.NONE), /** * Enables all available option flags */ - ALL(RegExp.ALL); + ALL(RegExp87.ALL); final int value; @@ -110,9 +110,9 @@ public enum RegexpFlag { */ public static int resolveValue(String flags) { if (flags == null || flags.isEmpty()) { - return RegExp.ALL; + return RegExp87.ALL; } - int magic = RegExp.NONE; + int magic = RegExp87.NONE; for (String s : Strings.delimitedListToStringArray(flags, "|")) { if (s.isEmpty()) { continue; diff --git a/server/src/main/java/org/elasticsearch/index/query/RegexpQueryBuilder.java b/server/src/main/java/org/elasticsearch/index/query/RegexpQueryBuilder.java index 472c1014874..0a99cc7e7ac 100644 --- a/server/src/main/java/org/elasticsearch/index/query/RegexpQueryBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/RegexpQueryBuilder.java @@ -22,8 +22,10 @@ package org.elasticsearch.index.query; import org.apache.lucene.index.Term; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegExp87; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.Strings; @@ -48,10 +50,12 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder public static final int DEFAULT_FLAGS_VALUE = RegexpFlag.ALL.value(); public static final int DEFAULT_MAX_DETERMINIZED_STATES = Operations.DEFAULT_MAX_DETERMINIZED_STATES; + public static final boolean DEFAULT_CASE_INSENSITIVITY = false; private static final ParseField FLAGS_VALUE_FIELD = new 
ParseField("flags_value"); private static final ParseField MAX_DETERMINIZED_STATES_FIELD = new ParseField("max_determinized_states"); private static final ParseField FLAGS_FIELD = new ParseField("flags"); + private static final ParseField CASE_INSENSITIVE_FIELD = new ParseField("case_insensitive"); private static final ParseField REWRITE_FIELD = new ParseField("rewrite"); private static final ParseField VALUE_FIELD = new ParseField("value"); @@ -59,7 +63,8 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder private final String value; - private int flagsValue = DEFAULT_FLAGS_VALUE; + private int syntaxFlagsValue = DEFAULT_FLAGS_VALUE; + private boolean caseInsensitive = DEFAULT_CASE_INSENSITIVITY; private int maxDeterminizedStates = DEFAULT_MAX_DETERMINIZED_STATES; @@ -89,18 +94,24 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder super(in); fieldName = in.readString(); value = in.readString(); - flagsValue = in.readVInt(); + syntaxFlagsValue = in.readVInt(); maxDeterminizedStates = in.readVInt(); rewrite = in.readOptionalString(); + if (in.getVersion().onOrAfter(Version.V_7_10_0)) { + caseInsensitive = in.readBoolean(); + } } @Override protected void doWriteTo(StreamOutput out) throws IOException { out.writeString(fieldName); out.writeString(value); - out.writeVInt(flagsValue); + out.writeVInt(syntaxFlagsValue); out.writeVInt(maxDeterminizedStates); out.writeOptionalString(rewrite); + if (out.getVersion().onOrAfter(Version.V_7_10_0)) { + out.writeBoolean(caseInsensitive); + } } /** Returns the field name used in this query. */ @@ -118,7 +129,7 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder public RegexpQueryBuilder flags(RegexpFlag... 
flags) { if (flags == null) { - this.flagsValue = DEFAULT_FLAGS_VALUE; + this.syntaxFlagsValue = DEFAULT_FLAGS_VALUE; return this; } int value = 0; @@ -129,19 +140,31 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder value |= flag.value; } } - this.flagsValue = value; + this.syntaxFlagsValue = value; return this; } public RegexpQueryBuilder flags(int flags) { - this.flagsValue = flags; + this.syntaxFlagsValue = flags; return this; } public int flags() { - return this.flagsValue; + return this.syntaxFlagsValue; } + + public RegexpQueryBuilder caseInsensitive(boolean caseInsensitive) { + if (caseInsensitive == false) { + throw new IllegalArgumentException("The case insensitive setting cannot be set to false."); + } + this.caseInsensitive = caseInsensitive; + return this; + } + public boolean caseInsensitive() { + return this.caseInsensitive; + } + /** * Sets the regexp maxDeterminizedStates. */ @@ -168,7 +191,10 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder builder.startObject(NAME); builder.startObject(fieldName); builder.field(VALUE_FIELD.getPreferredName(), this.value); - builder.field(FLAGS_VALUE_FIELD.getPreferredName(), flagsValue); + builder.field(FLAGS_VALUE_FIELD.getPreferredName(), syntaxFlagsValue); + if (caseInsensitive != DEFAULT_CASE_INSENSITIVITY) { + builder.field(CASE_INSENSITIVE_FIELD.getPreferredName(), caseInsensitive); + } builder.field(MAX_DETERMINIZED_STATES_FIELD.getPreferredName(), maxDeterminizedStates); if (rewrite != null) { builder.field(REWRITE_FIELD.getPreferredName(), rewrite); @@ -184,6 +210,7 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder String value = null; float boost = AbstractQueryBuilder.DEFAULT_BOOST; int flagsValue = RegexpQueryBuilder.DEFAULT_FLAGS_VALUE; + boolean caseInsensitive = DEFAULT_CASE_INSENSITIVITY; int maxDeterminizedStates = RegexpQueryBuilder.DEFAULT_MAX_DETERMINIZED_STATES; String queryName = null; String currentFieldName = null; @@ -211,6 +238,12 @@ public 
class RegexpQueryBuilder extends AbstractQueryBuilder maxDeterminizedStates = parser.intValue(); } else if (FLAGS_VALUE_FIELD.match(currentFieldName, parser.getDeprecationHandler())) { flagsValue = parser.intValue(); + } else if (CASE_INSENSITIVE_FIELD.match(currentFieldName, parser.getDeprecationHandler())) { + caseInsensitive = parser.booleanValue(); + if (caseInsensitive == false) { + throw new ParsingException(parser.getTokenLocation(), + "[regexp] query does not support [" + currentFieldName + "] = false"); + } } else if (AbstractQueryBuilder.NAME_FIELD.match(currentFieldName, parser.getDeprecationHandler())) { queryName = parser.text(); } else { @@ -226,12 +259,16 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder } } - return new RegexpQueryBuilder(fieldName, value) + RegexpQueryBuilder result = new RegexpQueryBuilder(fieldName, value) .flags(flagsValue) .maxDeterminizedStates(maxDeterminizedStates) .rewrite(rewrite) .boost(boost) .queryName(queryName); + if (caseInsensitive) { + result.caseInsensitive(caseInsensitive); + } + return result; } @Override @@ -251,13 +288,18 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder } MultiTermQuery.RewriteMethod method = QueryParsers.parseRewriteMethod(rewrite, null, LoggingDeprecationHandler.INSTANCE); + int matchFlagsValue = caseInsensitive ? 
RegExp87.ASCII_CASE_INSENSITIVE : 0; Query query = null; + // For BWC we mask irrelevant bits (RegExp changed ALL from 0xffff to 0xff) + int sanitisedSyntaxFlag = syntaxFlagsValue & RegExp87.ALL; + MappedFieldType fieldType = context.fieldMapper(fieldName); if (fieldType != null) { - query = fieldType.regexpQuery(value, flagsValue, maxDeterminizedStates, method, context); + query = fieldType.regexpQuery(value, sanitisedSyntaxFlag, matchFlagsValue, maxDeterminizedStates, method, context); } if (query == null) { - RegexpQuery regexpQuery = new RegexpQuery(new Term(fieldName, BytesRefs.toBytesRef(value)), flagsValue, maxDeterminizedStates); + RegexpQuery87 regexpQuery = new RegexpQuery87(new Term(fieldName, BytesRefs.toBytesRef(value)), sanitisedSyntaxFlag, + matchFlagsValue, maxDeterminizedStates); if (method != null) { regexpQuery.setRewriteMethod(method); } @@ -268,14 +310,15 @@ public class RegexpQueryBuilder extends AbstractQueryBuilder @Override protected int doHashCode() { - return Objects.hash(fieldName, value, flagsValue, maxDeterminizedStates, rewrite); + return Objects.hash(fieldName, value, syntaxFlagsValue, caseInsensitive, maxDeterminizedStates, rewrite); } @Override protected boolean doEquals(RegexpQueryBuilder other) { return Objects.equals(fieldName, other.fieldName) && Objects.equals(value, other.value) && - Objects.equals(flagsValue, other.flagsValue) && + Objects.equals(syntaxFlagsValue, other.syntaxFlagsValue) && + Objects.equals(caseInsensitive, other.caseInsensitive) && Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates) && Objects.equals(rewrite, other.rewrite); } diff --git a/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java b/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java index 8120dc68a22..b03a4cf5e64 100644 --- a/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java +++ 
b/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java @@ -37,13 +37,13 @@ import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegExp87; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.common.lucene.search.Queries; import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.unit.Fuzziness; @@ -731,7 +731,7 @@ public class QueryStringQueryParser extends XQueryParser { setAnalyzer(forceAnalyzer); return super.getRegexpQuery(field, termStr); } - return currentFieldType.regexpQuery(termStr, RegExp.ALL, getMaxDeterminizedStates(), + return currentFieldType.regexpQuery(termStr, RegExp87.ALL, 0, getMaxDeterminizedStates(), getMultiTermRewriteMethod(), context); } catch (RuntimeException e) { if (lenient) { diff --git a/server/src/main/java/org/elasticsearch/search/suggest/completion/RegexOptions.java b/server/src/main/java/org/elasticsearch/search/suggest/completion/RegexOptions.java index 814fca29b0f..d7d84bb4ceb 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/completion/RegexOptions.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/completion/RegexOptions.java @@ -19,8 +19,8 @@ package org.elasticsearch.search.suggest.completion; +import org.apache.lucene.search.RegExp87; import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.io.stream.StreamInput; @@ 
-143,7 +143,7 @@ public class RegexOptions implements ToXContentFragment, Writeable { * Options for regular expression queries */ public static class Builder { - private int flagsValue = RegExp.ALL; + private int flagsValue = RegExp87.ALL; private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES; public Builder() { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java index 29ac6f6cccb..1c691e456a9 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegExp87; import org.apache.lucene.search.suggest.document.CompletionAnalyzer; import org.apache.lucene.search.suggest.document.ContextSuggestField; import org.apache.lucene.search.suggest.document.FuzzyCompletionQuery; @@ -31,7 +32,6 @@ import org.apache.lucene.search.suggest.document.SuggestField; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.Strings; @@ -889,7 +889,7 @@ public class CompletionFieldMapperTests extends ESSingleNodeTestCase { Mapper fieldMapper = defaultMapper.mappers().getMapper("completion"); CompletionFieldMapper completionFieldMapper = (CompletionFieldMapper) fieldMapper; Query prefixQuery = completionFieldMapper.fieldType() - .regexpQuery(new BytesRef("co"), RegExp.ALL, 
Operations.DEFAULT_MAX_DETERMINIZED_STATES); + .regexpQuery(new BytesRef("co"), RegExp87.ALL, Operations.DEFAULT_MAX_DETERMINIZED_STATES); assertThat(prefixQuery, instanceOf(RegexCompletionQuery.class)); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/IgnoredFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/IgnoredFieldTypeTests.java index 602287030dc..c864a9c7742 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/IgnoredFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/IgnoredFieldTypeTests.java @@ -22,7 +22,7 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.index.Term; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; @@ -44,11 +44,11 @@ public class IgnoredFieldTypeTests extends FieldTypeTestCase { public void testRegexpQuery() { MappedFieldType ft = IgnoredFieldMapper.IgnoredFieldType.INSTANCE; - Query expected = new RegexpQuery(new Term("_ignored", new BytesRef("foo?"))); - assertEquals(expected, ft.regexpQuery("foo?", 0, 10, null, MOCK_QSC)); + Query expected = new RegexpQuery87(new Term("_ignored", new BytesRef("foo?"))); + assertEquals(expected, ft.regexpQuery("foo?", 0, 0, 10, null, MOCK_QSC)); ElasticsearchException ee = expectThrows(ElasticsearchException.class, - () -> ft.regexpQuery("foo?", randomInt(10), randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); + () -> ft.regexpQuery("foo?", randomInt(10), 0, randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); assertEquals("[regexp] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java 
b/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java index 11dcfe4f23f..d7041ed4b67 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java @@ -53,7 +53,7 @@ public class IndexFieldTypeTests extends ESTestCase { MappedFieldType ft = IndexFieldMapper.IndexFieldType.INSTANCE; QueryShardException e = expectThrows(QueryShardException.class, () -> - assertEquals(new MatchAllDocsQuery(), ft.regexpQuery("ind.x", 0, 10, null, createContext()))); + assertEquals(new MatchAllDocsQuery(), ft.regexpQuery("ind.x", 0, 0, 10, null, createContext()))); assertThat(e.getMessage(), containsString("Can only use regexp queries on keyword and text fields")); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java index a6f0a7ee5f0..85718f9fc86 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.NormsFieldExistsQuery; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; @@ -128,16 +128,16 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase { public void testRegexpQuery() { MappedFieldType ft = new KeywordFieldType("field"); - assertEquals(new RegexpQuery(new Term("field","foo.*")), - ft.regexpQuery("foo.*", 0, 10, null, MOCK_QSC)); + assertEquals(new RegexpQuery87(new Term("field","foo.*")), + ft.regexpQuery("foo.*", 0, 0, 10, null, 
MOCK_QSC)); MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap()); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> unsearchable.regexpQuery("foo.*", 0, 10, null, MOCK_QSC)); + () -> unsearchable.regexpQuery("foo.*", 0, 0, 10, null, MOCK_QSC)); assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); ElasticsearchException ee = expectThrows(ElasticsearchException.class, - () -> ft.regexpQuery("foo.*", randomInt(10), randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); + () -> ft.regexpQuery("foo.*", randomInt(10), 0, randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); assertEquals("[regexp] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/RoutingFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/RoutingFieldTypeTests.java index 49855b4096f..d4af02d3567 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/RoutingFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/RoutingFieldTypeTests.java @@ -21,7 +21,7 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.index.Term; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; @@ -43,11 +43,11 @@ public class RoutingFieldTypeTests extends FieldTypeTestCase { public void testRegexpQuery() { MappedFieldType ft = RoutingFieldMapper.RoutingFieldType.INSTANCE; - Query expected = new RegexpQuery(new Term("_routing", new BytesRef("foo?"))); - assertEquals(expected, ft.regexpQuery("foo?", 0, 10, null, MOCK_QSC)); + Query expected = new RegexpQuery87(new 
Term("_routing", new BytesRef("foo?"))); + assertEquals(expected, ft.regexpQuery("foo?", 0, 0, 10, null, MOCK_QSC)); ElasticsearchException ee = expectThrows(ElasticsearchException.class, - () -> ft.regexpQuery("foo?", randomInt(10), randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); + () -> ft.regexpQuery("foo?", randomInt(10), 0, randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); assertEquals("[regexp] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java index 43a1ff09314..844d6c0cc22 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java @@ -26,7 +26,7 @@ import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; @@ -86,16 +86,16 @@ public class TextFieldTypeTests extends FieldTypeTestCase { public void testRegexpQuery() { MappedFieldType ft = new TextFieldType("field"); - assertEquals(new RegexpQuery(new Term("field","foo.*")), - ft.regexpQuery("foo.*", 0, 10, null, MOCK_QSC)); + assertEquals(new RegexpQuery87(new Term("field","foo.*")), + ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_QSC)); MappedFieldType unsearchable = new TextFieldType("field", false, Collections.emptyMap()); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> unsearchable.regexpQuery("foo.*", 0, 10, null, MOCK_QSC)); + () -> unsearchable.regexpQuery("foo.*", 0, 0, 10, null, 
MOCK_QSC)); assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); ElasticsearchException ee = expectThrows(ElasticsearchException.class, - () -> ft.regexpQuery("foo.*", randomInt(10), randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); + () -> ft.regexpQuery("foo.*", randomInt(10), 0, randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); assertEquals("[regexp] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); } diff --git a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java index f7993ae7e16..f4e21b39888 100644 --- a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java @@ -40,7 +40,7 @@ import org.apache.lucene.search.NormsFieldExistsQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; @@ -733,8 +733,8 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase parseQuery(shortJson)); assertEquals("[regexp] query doesn't support multiple fields, found [user1] and [user2]", e.getMessage()); - } + } + + public void testParseFailsWithCaseSensitive() throws IOException { + String json = + "{\n" + + " \"regexp\": {\n" + + " \"user1\": {\n" + + " \"value\": \"k.*y\",\n" + + " \"case_insensitive\": false\n" + + " },\n" + + " }\n" + + "}"; + ParsingException e = expectThrows(ParsingException.class, () -> parseQuery(json)); + assertEquals("[regexp] query does not support [case_insensitive] = false", e.getMessage()); + } 
+ + public void testDeadCode() { + assertTrue(RegExp87.class + " should be replaced with 8.7's "+RegExp.class, + org.apache.lucene.util.Version.LATEST.major == 8 && org.apache.lucene.util.Version.LATEST.minor < 7); + } + + } diff --git a/x-pack/plugin/mapper-constant-keyword/src/main/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldMapper.java b/x-pack/plugin/mapper-constant-keyword/src/main/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldMapper.java index 50b72022fe1..6096977cfac 100644 --- a/x-pack/plugin/mapper-constant-keyword/src/main/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldMapper.java +++ b/x-pack/plugin/mapper-constant-keyword/src/main/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldMapper.java @@ -13,12 +13,12 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegExp87; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; -import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.common.geo.ShapeRelation; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.regex.Regex; @@ -208,13 +208,13 @@ public class ConstantKeywordFieldMapper extends FieldMapper { } @Override - public Query regexpQuery(String value, int flags, int maxDeterminizedStates, + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, MultiTermQuery.RewriteMethod method, QueryShardContext context) { if (this.value == null) { return new MatchNoDocsQuery(); } - final Automaton automaton = new RegExp(value, 
flags).toAutomaton(maxDeterminizedStates); + final Automaton automaton = new RegExp87(value, syntaxFlags, matchFlags).toAutomaton(maxDeterminizedStates); final CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton); if (runAutomaton.run(this.value)) { return new MatchAllDocsQuery(); diff --git a/x-pack/plugin/mapper-constant-keyword/src/test/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldTypeTests.java b/x-pack/plugin/mapper-constant-keyword/src/test/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldTypeTests.java index 3952e33a730..9b5831cedf8 100644 --- a/x-pack/plugin/mapper-constant-keyword/src/test/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldTypeTests.java +++ b/x-pack/plugin/mapper-constant-keyword/src/test/java/org/elasticsearch/xpack/constantkeyword/mapper/ConstantKeywordFieldTypeTests.java @@ -8,7 +8,7 @@ package org.elasticsearch.xpack.constantkeyword.mapper; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; -import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.search.RegExp87; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.index.mapper.FieldTypeTestCase; import org.elasticsearch.xpack.constantkeyword.mapper.ConstantKeywordFieldMapper.ConstantKeywordFieldType; @@ -86,9 +86,9 @@ public class ConstantKeywordFieldTypeTests extends FieldTypeTestCase { public void testRegexpQuery() { ConstantKeywordFieldType none = new ConstantKeywordFieldType("f", null); - assertEquals(new MatchNoDocsQuery(), none.regexpQuery("f..o", RegExp.ALL, 10, null, null)); + assertEquals(new MatchNoDocsQuery(), none.regexpQuery("f..o", RegExp87.ALL, 0, 10, null, null)); ConstantKeywordFieldType ft = new ConstantKeywordFieldType("f", "foo"); - assertEquals(new MatchAllDocsQuery(), ft.regexpQuery("f.o", RegExp.ALL, 10, null, null)); - assertEquals(new MatchNoDocsQuery(), ft.regexpQuery("f..o", 
RegExp.ALL, 10, null, null)); + assertEquals(new MatchAllDocsQuery(), ft.regexpQuery("f.o", RegExp87.ALL, 0, 10, null, null)); + assertEquals(new MatchNoDocsQuery(), ft.regexpQuery("f..o", RegExp87.ALL, 0, 10, null, null)); } } diff --git a/x-pack/plugin/mapper-flattened/src/main/java/org/elasticsearch/xpack/flattened/mapper/FlatObjectFieldMapper.java b/x-pack/plugin/mapper-flattened/src/main/java/org/elasticsearch/xpack/flattened/mapper/FlatObjectFieldMapper.java index d79b937ee18..203bde6e37b 100644 --- a/x-pack/plugin/mapper-flattened/src/main/java/org/elasticsearch/xpack/flattened/mapper/FlatObjectFieldMapper.java +++ b/x-pack/plugin/mapper-flattened/src/main/java/org/elasticsearch/xpack/flattened/mapper/FlatObjectFieldMapper.java @@ -291,7 +291,7 @@ public final class FlatObjectFieldMapper extends DynamicKeyFieldMapper { } @Override - public Query regexpQuery(String value, int flags, int maxDeterminizedStates, + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, MultiTermQuery.RewriteMethod method, QueryShardContext context) { throw new UnsupportedOperationException("[regexp] queries are not currently supported on keyed " + "[" + CONTENT_TYPE + "] fields."); diff --git a/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/KeyedFlatObjectFieldTypeTests.java b/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/KeyedFlatObjectFieldTypeTests.java index 09d6afb6710..2ccb714d0bc 100644 --- a/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/KeyedFlatObjectFieldTypeTests.java +++ b/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/KeyedFlatObjectFieldTypeTests.java @@ -130,7 +130,7 @@ public class KeyedFlatObjectFieldTypeTests extends FieldTypeTestCase { KeyedFlatObjectFieldType ft = createFieldType(); UnsupportedOperationException e = 
expectThrows(UnsupportedOperationException.class, - () -> ft.regexpQuery("valu*", 0, 10, null, randomMockShardContext())); + () -> ft.regexpQuery("valu*", 0, 0, 10, null, randomMockShardContext())); assertEquals("[regexp] queries are not currently supported on keyed [flattened] fields.", e.getMessage()); } diff --git a/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/RootFlatObjectFieldTypeTests.java b/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/RootFlatObjectFieldTypeTests.java index 3f049b003dd..dc61b4298b9 100644 --- a/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/RootFlatObjectFieldTypeTests.java +++ b/x-pack/plugin/mapper-flattened/src/test/java/org/elasticsearch/xpack/flattened/mapper/RootFlatObjectFieldTypeTests.java @@ -10,7 +10,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.RegexpQuery87; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; @@ -96,12 +96,12 @@ public class RootFlatObjectFieldTypeTests extends FieldTypeTestCase { public void testRegexpQuery() { RootFlatObjectFieldType ft = createDefaultFieldType(); - Query expected = new RegexpQuery(new Term("field", "val.*")); - Query actual = ft.regexpQuery("val.*", 0, 10, null, MOCK_QSC); + Query expected = new RegexpQuery87(new Term("field", "val.*")); + Query actual = ft.regexpQuery("val.*", 0, 0, 10, null, MOCK_QSC); assertEquals(expected, actual); ElasticsearchException ee = expectThrows(ElasticsearchException.class, - () -> ft.regexpQuery("val.*", randomInt(10), randomInt(10) + 1, null, MOCK_QSC_DISALLOW_EXPENSIVE)); + () -> ft.regexpQuery("val.*", randomInt(10), 0, randomInt(10) + 1, 
null, MOCK_QSC_DISALLOW_EXPENSIVE)); assertEquals("[regexp] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); } diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index 23d9fbad483..e9dc720a5e0 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -106,6 +106,23 @@ setup: - match: {hits.total.value: 1} +--- +"Case insensitive query": + - skip: + features: headers + version: " - 7.9.99" + reason: "Case insensitive flag added in 7.10" + - do: + search: + body: + track_total_hits: true + query: + regexp: + my_wildcard: {value: ".*Worl.*", case_insensitive: true} + + + - match: {hits.total.value: 3} + --- "null query": - do: diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 1caa6d9f4ff..a8e0c42d05e 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -29,13 +29,13 @@ import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.MultiTermQuery.RewriteMethod; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegExp87; +import org.apache.lucene.search.RegExp87.Kind; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; -import 
org.apache.lucene.util.automaton.RegExp; -import org.apache.lucene.util.automaton.RegExp.Kind; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.geo.ShapeRelation; import org.elasticsearch.common.lucene.BytesRefs; @@ -294,12 +294,13 @@ public class WildcardFieldMapper extends FieldMapper { } @Override - public Query regexpQuery(String value, int flags, int maxDeterminizedStates, RewriteMethod method, QueryShardContext context) { + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, + RewriteMethod method, QueryShardContext context) { if (value.length() == 0) { return new MatchNoDocsQuery(); } - RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), flags); + RegExp87 ngramRegex = new RegExp87(addLineEndChars(toLowerCase(value)), syntaxFlags, matchFlags); Query approxBooleanQuery = toApproximationQuery(ngramRegex); Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery); @@ -310,7 +311,7 @@ public class WildcardFieldMapper extends FieldMapper { return existsQuery(context); } Supplier deferredAutomatonSupplier = ()-> { - RegExp regex = new RegExp(value, flags); + RegExp87 regex = new RegExp87(value, syntaxFlags, matchFlags); return regex.toAutomaton(maxDeterminizedStates); }; @@ -339,7 +340,7 @@ public class WildcardFieldMapper extends FieldMapper { // * If an expression resolves to a RegExpQuery eg ?? then only the verification // query is run. // * Anything else is a concrete query that should be run on the ngram index. 
- public static Query toApproximationQuery(RegExp r) throws IllegalArgumentException { + public static Query toApproximationQuery(RegExp87 r) throws IllegalArgumentException { Query result = null; switch (r.kind) { case REGEXP_UNION: @@ -400,7 +401,7 @@ public class WildcardFieldMapper extends FieldMapper { return result; } - private static Query createConcatenationQuery(RegExp r) { + private static Query createConcatenationQuery(RegExp87 r) { // Create ANDs of expressions plus collapse consecutive TermQuerys into single longer ones ArrayList queries = new ArrayList<>(); findLeaves(r.exp1, Kind.REGEXP_CONCATENATION, queries); @@ -431,7 +432,7 @@ public class WildcardFieldMapper extends FieldMapper { } - private static Query createUnionQuery(RegExp r) { + private static Query createUnionQuery(RegExp87 r) { // Create an OR of clauses ArrayList queries = new ArrayList<>(); findLeaves(r.exp1, Kind.REGEXP_UNION, queries); @@ -458,7 +459,7 @@ public class WildcardFieldMapper extends FieldMapper { return new MatchAllButRequireVerificationQuery(); } - private static void findLeaves(RegExp exp, Kind kind, List queries) { + private static void findLeaves(RegExp87 exp, Kind kind, List queries) { if (exp.kind == kind) { findLeaves(exp.exp1, kind, queries); findLeaves( exp.exp2, kind, queries); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index f10a368f3ba..5c1bc799948 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -26,6 +26,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import 
org.apache.lucene.search.Query; +import org.apache.lucene.search.RegExp87; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -37,7 +38,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.RegExp; +//import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.collect.List; @@ -168,7 +169,7 @@ public class WildcardFieldMapperTests extends ESTestCase { assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L)); // Test regexp query - wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(queryString, RegExp.ALL, 20000, null, MOCK_QSC); + wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(queryString, RegExp87.ALL, 0, 20000, null, MOCK_QSC); wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER); assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L)); @@ -225,8 +226,8 @@ public class WildcardFieldMapperTests extends ESTestCase { break; case 1: pattern = getRandomRegexPattern(values); - wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC); - keywordFieldQuery = keywordFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC); + wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(pattern, RegExp87.ALL, 0, 20000, null, MOCK_QSC); + keywordFieldQuery = keywordFieldType.fieldType().regexpQuery(pattern, RegExp87.ALL, 0,20000, null, MOCK_QSC); break; case 2: pattern = randomABString(5); @@ -379,12 +380,12 @@ public class WildcardFieldMapperTests extends ESTestCase { // All these expressions should rewrite to a match all with no verification step required at all String 
superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"}; for (String regex : superfastRegexes) { - Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC); + Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp87.ALL, 0, 20000, null, MOCK_QSC); assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery); } String matchNoDocsRegexes[]= { ""}; for (String regex : matchNoDocsRegexes) { - Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC); + Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp87.ALL, 0, 20000, null, MOCK_QSC); assertTrue(wildcardFieldQuery instanceof MatchNoDocsQuery); } @@ -404,7 +405,7 @@ public class WildcardFieldMapperTests extends ESTestCase { for (String[] test : acceleratedTests) { String regex = test[0]; String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR); - Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC); + Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp87.ALL, 0, 20000, null, MOCK_QSC); testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString); } @@ -412,7 +413,7 @@ public class WildcardFieldMapperTests extends ESTestCase { // TODO we can possibly improve on some of these String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"}; for (String regex : matchAllButVerifyTests) { - Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC); + Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp87.ALL, 0, 20000, null, MOCK_QSC); assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery), wildcardFieldQuery instanceof 
AutomatonQueryOnBinaryDv); } @@ -428,7 +429,7 @@ public class WildcardFieldMapperTests extends ESTestCase { for (String[] test : suboptimalTests) { String regex = test[0]; String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR); - Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC); + Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp87.ALL, 0, 20000, null, MOCK_QSC); testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString); } @@ -767,7 +768,7 @@ public class WildcardFieldMapperTests extends ESTestCase { } //Assert our randomly generated regex actually matches the provided raw input. - RegExp regex = new RegExp(result.toString()); + RegExp87 regex = new RegExp87(result.toString()); Automaton automaton = regex.toAutomaton(); ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); BytesRef br = new BytesRef(randomValue);