Case Insensitive Support in Regexp Interval (#2237)

Add a `case_insensitive` flag to regexp interval source.

Signed-off-by: Matt Weber <matt@mattweber.org>
This commit is contained in:
Matt Weber 2022-02-24 12:54:13 -08:00 committed by GitHub
parent 788ba99915
commit 37235fafd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 111 additions and 11 deletions

View File

@ -25,6 +25,56 @@ setup:
- '{"index": {"_index": "test", "_id": "6"}}' - '{"index": {"_index": "test", "_id": "6"}}'
- '{"text" : "that is some cold cold rain"}' - '{"text" : "that is some cold cold rain"}'
---
"Test regexp":
- skip:
version: " - 1.2.99"
reason: "regexp introduced in 1.3"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "at[a-z]{2,}here"
- match: { hits.total.value: 1 }
---
"Test regexp, explicit case sensitive":
- skip:
version: " - 1.99.99"
reason: "case_insensitive introduced in 2.0"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "AT[a-z]{2,}HERE"
case_insensitive: false
- match: { hits.total.value: 0 }
---
"Test regexp, explicit case insensitive":
- skip:
version: " - 1.99.99"
reason: "case_insensitive introduced in 2.0"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "AT[a-z]{2,}HERE"
case_insensitive: true
- match: { hits.total.value: 1 }
--- ---
"Test ordered matching with via mode": "Test ordered matching with via mode":
- skip: - skip:

View File

@ -40,6 +40,7 @@ import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.LegacyESVersion; import org.opensearch.LegacyESVersion;
import org.opensearch.Version; import org.opensearch.Version;
import org.opensearch.common.ParseField; import org.opensearch.common.ParseField;
@ -687,12 +688,20 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
private final int flags; private final int flags;
private final String useField; private final String useField;
private final Integer maxExpansions; private final Integer maxExpansions;
private final boolean caseInsensitive;
public Regexp(String pattern, int flags, String useField, Integer maxExpansions) { /**
* Constructor
*
* {@code flags} are Lucene's <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java#L391-L411">syntax flags</a>,
* and {@code caseInsensitive} enables Lucene's only <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java#L416">matching flag</a> ({@code RegExp.ASCII_CASE_INSENSITIVE}).
*/
public Regexp(String pattern, int flags, String useField, Integer maxExpansions, boolean caseInsensitive) {
this.pattern = pattern; this.pattern = pattern;
this.flags = flags; this.flags = flags;
this.useField = useField; this.useField = useField;
this.maxExpansions = (maxExpansions != null && maxExpansions > 0) ? maxExpansions : null; this.maxExpansions = (maxExpansions != null && maxExpansions > 0) ? maxExpansions : null;
this.caseInsensitive = caseInsensitive;
} }
public Regexp(StreamInput in) throws IOException { public Regexp(StreamInput in) throws IOException {
@ -700,11 +709,20 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
this.flags = in.readVInt(); this.flags = in.readVInt();
this.useField = in.readOptionalString(); this.useField = in.readOptionalString();
this.maxExpansions = in.readOptionalVInt(); this.maxExpansions = in.readOptionalVInt();
if (in.getVersion().onOrAfter(Version.V_2_0_0)) {
this.caseInsensitive = in.readBoolean();
} else {
this.caseInsensitive = false;
}
} }
@Override @Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) { public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(pattern, flags); final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(
pattern,
flags,
caseInsensitive ? RegExp.ASCII_CASE_INSENSITIVE : 0
);
final CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton()); final CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());
if (useField != null) { if (useField != null) {
@ -745,12 +763,13 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
return Objects.equals(pattern, regexp.pattern) return Objects.equals(pattern, regexp.pattern)
&& Objects.equals(flags, regexp.flags) && Objects.equals(flags, regexp.flags)
&& Objects.equals(useField, regexp.useField) && Objects.equals(useField, regexp.useField)
&& Objects.equals(maxExpansions, regexp.maxExpansions); && Objects.equals(maxExpansions, regexp.maxExpansions)
&& Objects.equals(caseInsensitive, regexp.caseInsensitive);
} }
@Override @Override
public int hashCode() { public int hashCode() {
return Objects.hash(pattern, flags, useField, maxExpansions); return Objects.hash(pattern, flags, useField, maxExpansions, caseInsensitive);
} }
@Override @Override
@ -764,6 +783,9 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
out.writeVInt(flags); out.writeVInt(flags);
out.writeOptionalString(useField); out.writeOptionalString(useField);
out.writeOptionalVInt(maxExpansions); out.writeOptionalVInt(maxExpansions);
if (out.getVersion().onOrAfter(Version.V_2_0_0)) {
out.writeBoolean(caseInsensitive);
}
} }
@Override @Override
@ -779,6 +801,9 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
if (maxExpansions != null) { if (maxExpansions != null) {
builder.field("max_expansions", maxExpansions); builder.field("max_expansions", maxExpansions);
} }
if (caseInsensitive) {
builder.field("case_insensitive", caseInsensitive);
}
builder.endObject(); builder.endObject();
return builder; return builder;
} }
@ -789,13 +814,14 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
Integer flagsValue = (Integer) args[2]; Integer flagsValue = (Integer) args[2];
String useField = (String) args[3]; String useField = (String) args[3];
Integer maxExpansions = (Integer) args[4]; Integer maxExpansions = (Integer) args[4];
boolean caseInsensitive = args[5] != null && (boolean) args[5];
if (flagsValue != null) { if (flagsValue != null) {
return new Regexp(pattern, flagsValue, useField, maxExpansions); return new Regexp(pattern, flagsValue, useField, maxExpansions, caseInsensitive);
} else if (flags != null) { } else if (flags != null) {
return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions); return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions, caseInsensitive);
} else { } else {
return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions); return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions, caseInsensitive);
} }
}); });
static { static {
@ -804,6 +830,7 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
PARSER.declareInt(optionalConstructorArg(), new ParseField("flags_value")); PARSER.declareInt(optionalConstructorArg(), new ParseField("flags_value"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field")); PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("max_expansions")); PARSER.declareInt(optionalConstructorArg(), new ParseField("max_expansions"));
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("case_insensitive"));
} }
public static Regexp fromXContent(XContentParser parser) throws IOException { public static Regexp fromXContent(XContentParser parser) throws IOException {
@ -825,6 +852,10 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
Integer getMaxExpansions() { Integer getMaxExpansions() {
return maxExpansions; return maxExpansions;
} }
boolean isCaseInsensitive() {
return caseInsensitive;
}
} }
public static class Wildcard extends IntervalsSourceProvider { public static class Wildcard extends IntervalsSourceProvider {

View File

@ -846,7 +846,11 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
} }
private static IntervalsSource buildRegexpSource(String pattern, int flags, Integer maxExpansions) { private static IntervalsSource buildRegexpSource(String pattern, int flags, Integer maxExpansions) {
final RegExp regexp = new RegExp(pattern, flags); return buildRegexpSource(pattern, flags, 0, maxExpansions);
}
private static IntervalsSource buildRegexpSource(String pattern, int flags, int matchFlags, Integer maxExpansions) {
final RegExp regexp = new RegExp(pattern, flags, matchFlags);
CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton()); CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());
if (maxExpansions != null) { if (maxExpansions != null) {
@ -922,6 +926,15 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("te.m", DEFAULT_FLAGS, 500)); expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("te.m", DEFAULT_FLAGS, 500));
assertEquals(expected, builder.toQuery(createShardContext())); assertEquals(expected, builder.toQuery(createShardContext()));
String regexp_case_insensitive_json = "{ \"intervals\" : { \""
+ TEXT_FIELD_NAME
+ "\": { "
+ "\"regexp\" : { \"pattern\" : \"TE.M\", \"case_insensitive\" : true } } } }";
builder = (IntervalQueryBuilder) parseQuery(regexp_case_insensitive_json);
expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("TE.M", DEFAULT_FLAGS, RegExp.ASCII_CASE_INSENSITIVE, null));
assertEquals(expected, builder.toQuery(createShardContext()));
String regexp_neg_max_expand_json = "{ \"intervals\" : { \"" String regexp_neg_max_expand_json = "{ \"intervals\" : { \""
+ TEXT_FIELD_NAME + TEXT_FIELD_NAME
+ "\": { " + "\": { "

View File

@ -32,7 +32,8 @@ public class RegexpIntervalsSourceProviderTests extends AbstractSerializingTestC
randomAlphaOfLengthBetween(0, 3) + (randomBoolean() ? ".*?" : "." + randomAlphaOfLength(4)) + randomAlphaOfLengthBetween(0, 5), randomAlphaOfLengthBetween(0, 3) + (randomBoolean() ? ".*?" : "." + randomAlphaOfLength(4)) + randomAlphaOfLengthBetween(0, 5),
randomBoolean() ? RegexpFlag.resolveValue(randomFrom(FLAGS)) : RegexpFlag.ALL.value(), randomBoolean() ? RegexpFlag.resolveValue(randomFrom(FLAGS)) : RegexpFlag.ALL.value(),
randomBoolean() ? randomAlphaOfLength(10) : null, randomBoolean() ? randomAlphaOfLength(10) : null,
randomBoolean() ? randomIntBetween(-1, Integer.MAX_VALUE) : null randomBoolean() ? randomIntBetween(-1, Integer.MAX_VALUE) : null,
randomBoolean()
); );
} }
@ -42,7 +43,9 @@ public class RegexpIntervalsSourceProviderTests extends AbstractSerializingTestC
int flags = instance.getFlags(); int flags = instance.getFlags();
String useField = instance.getUseField(); String useField = instance.getUseField();
Integer maxExpansions = instance.getMaxExpansions(); Integer maxExpansions = instance.getMaxExpansions();
int ran = between(0, 3); boolean caseInsensitive = instance.isCaseInsensitive();
int ran = between(0, 4);
switch (ran) { switch (ran) {
case 0: case 0:
pattern += randomBoolean() ? ".*?" : randomAlphaOfLength(5); pattern += randomBoolean() ? ".*?" : randomAlphaOfLength(5);
@ -56,10 +59,13 @@ public class RegexpIntervalsSourceProviderTests extends AbstractSerializingTestC
case 3: case 3:
maxExpansions = maxExpansions == null ? randomIntBetween(1, Integer.MAX_VALUE) : null; maxExpansions = maxExpansions == null ? randomIntBetween(1, Integer.MAX_VALUE) : null;
break; break;
case 4:
caseInsensitive = !caseInsensitive;
break;
default: default:
throw new AssertionError("Illegal randomisation branch"); throw new AssertionError("Illegal randomisation branch");
} }
return new Regexp(pattern, flags, useField, maxExpansions); return new Regexp(pattern, flags, useField, maxExpansions, caseInsensitive);
} }
@Override @Override