Case Insensitive Support in Regexp Interval (#2237)

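Add an optional `case_insensitive` flag (default `false`) to the regexp interval source. When it is `true`, the pattern is compiled with Lucene's `RegExp.ASCII_CASE_INSENSITIVE` match flag.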

Signed-off-by: Matt Weber <matt@mattweber.org>
This commit is contained in:
Matt Weber 2022-02-24 12:54:13 -08:00 committed by GitHub
parent 788ba99915
commit 37235fafd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 111 additions and 11 deletions

View File

@ -25,6 +25,56 @@ setup:
- '{"index": {"_index": "test", "_id": "6"}}'
- '{"text" : "that is some cold cold rain"}'
---
"Test regexp":
- skip:
version: " - 1.2.99"
reason: "regexp introduced in 1.3"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "at[a-z]{2,}here"
- match: { hits.total.value: 1 }
---
"Test regexp, explicit case sensitive":
- skip:
version: " - 1.99.99"
reason: "case_insensitive introduced in 2.0"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "AT[a-z]{2,}HERE"
case_insensitive: false
- match: { hits.total.value: 0 }
---
"Test regexp, explicit case insensitive":
- skip:
version: " - 1.99.99"
reason: "case_insensitive introduced in 2.0"
- do:
search:
index: test
body:
query:
intervals:
text:
regexp:
pattern: "AT[a-z]{2,}HERE"
case_insensitive: true
- match: { hits.total.value: 1 }
---
"Test ordered matching with via mode":
- skip:

View File

@ -40,6 +40,7 @@ import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.LegacyESVersion;
import org.opensearch.Version;
import org.opensearch.common.ParseField;
@ -687,12 +688,20 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
private final int flags;
private final String useField;
private final Integer maxExpansions;
private final boolean caseInsensitive;
public Regexp(String pattern, int flags, String useField, Integer maxExpansions) {
/**
* Creates a regexp intervals source provider from its individual settings.
*
* {@code flags} is Lucene's <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java#L391-L411">syntax flags</a>
* and {@code caseInsensitive} enables Lucene's only <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java#L416">matching flag</a>.
*
* @param pattern the regular expression pattern
* @param flags Lucene {@code RegExp} syntax flags, passed through unchanged
* @param useField optional field name; may be {@code null}
* @param maxExpansions kept only when non-null and greater than zero; any other value is stored as {@code null}
* @param caseInsensitive when {@code true}, the pattern is compiled with {@code RegExp.ASCII_CASE_INSENSITIVE}
*/
public Regexp(String pattern, int flags, String useField, Integer maxExpansions, boolean caseInsensitive) {
this.pattern = pattern;
this.flags = flags;
this.useField = useField;
// Normalise null, zero and negative values to null so only a positive limit is retained.
this.maxExpansions = (maxExpansions != null && maxExpansions > 0) ? maxExpansions : null;
this.caseInsensitive = caseInsensitive;
}
public Regexp(StreamInput in) throws IOException {
@ -700,11 +709,20 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
this.flags = in.readVInt();
this.useField = in.readOptionalString();
this.maxExpansions = in.readOptionalVInt();
if (in.getVersion().onOrAfter(Version.V_2_0_0)) {
this.caseInsensitive = in.readBoolean();
} else {
this.caseInsensitive = false;
}
}
@Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(pattern, flags);
final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(
pattern,
flags,
caseInsensitive ? RegExp.ASCII_CASE_INSENSITIVE : 0
);
final CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());
if (useField != null) {
@ -745,12 +763,13 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
return Objects.equals(pattern, regexp.pattern)
&& Objects.equals(flags, regexp.flags)
&& Objects.equals(useField, regexp.useField)
&& Objects.equals(maxExpansions, regexp.maxExpansions);
&& Objects.equals(maxExpansions, regexp.maxExpansions)
&& Objects.equals(caseInsensitive, regexp.caseInsensitive);
}
@Override
public int hashCode() {
return Objects.hash(pattern, flags, useField, maxExpansions);
return Objects.hash(pattern, flags, useField, maxExpansions, caseInsensitive);
}
@Override
@ -764,6 +783,9 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
out.writeVInt(flags);
out.writeOptionalString(useField);
out.writeOptionalVInt(maxExpansions);
if (out.getVersion().onOrAfter(Version.V_2_0_0)) {
out.writeBoolean(caseInsensitive);
}
}
@Override
@ -779,6 +801,9 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
if (maxExpansions != null) {
builder.field("max_expansions", maxExpansions);
}
if (caseInsensitive) {
builder.field("case_insensitive", caseInsensitive);
}
builder.endObject();
return builder;
}
@ -789,13 +814,14 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
Integer flagsValue = (Integer) args[2];
String useField = (String) args[3];
Integer maxExpansions = (Integer) args[4];
boolean caseInsensitive = args[5] != null && (boolean) args[5];
if (flagsValue != null) {
return new Regexp(pattern, flagsValue, useField, maxExpansions);
return new Regexp(pattern, flagsValue, useField, maxExpansions, caseInsensitive);
} else if (flags != null) {
return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions);
return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions, caseInsensitive);
} else {
return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions);
return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions, caseInsensitive);
}
});
static {
@ -804,6 +830,7 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
PARSER.declareInt(optionalConstructorArg(), new ParseField("flags_value"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("max_expansions"));
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("case_insensitive"));
}
public static Regexp fromXContent(XContentParser parser) throws IOException {
@ -825,6 +852,10 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
/**
* Returns the stored max expansions value; it is {@code null} unless a positive value was supplied to the constructor.
*/
Integer getMaxExpansions() {
return maxExpansions;
}
/**
* Returns the {@code caseInsensitive} flag this provider was created with.
*/
boolean isCaseInsensitive() {
return caseInsensitive;
}
}
public static class Wildcard extends IntervalsSourceProvider {

View File

@ -846,7 +846,11 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
}
private static IntervalsSource buildRegexpSource(String pattern, int flags, Integer maxExpansions) {
final RegExp regexp = new RegExp(pattern, flags);
return buildRegexpSource(pattern, flags, 0, maxExpansions);
}
private static IntervalsSource buildRegexpSource(String pattern, int flags, int matchFlags, Integer maxExpansions) {
final RegExp regexp = new RegExp(pattern, flags, matchFlags);
CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());
if (maxExpansions != null) {
@ -922,6 +926,15 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("te.m", DEFAULT_FLAGS, 500));
assertEquals(expected, builder.toQuery(createShardContext()));
String regexp_case_insensitive_json = "{ \"intervals\" : { \""
+ TEXT_FIELD_NAME
+ "\": { "
+ "\"regexp\" : { \"pattern\" : \"TE.M\", \"case_insensitive\" : true } } } }";
builder = (IntervalQueryBuilder) parseQuery(regexp_case_insensitive_json);
expected = new IntervalQuery(TEXT_FIELD_NAME, buildRegexpSource("TE.M", DEFAULT_FLAGS, RegExp.ASCII_CASE_INSENSITIVE, null));
assertEquals(expected, builder.toQuery(createShardContext()));
String regexp_neg_max_expand_json = "{ \"intervals\" : { \""
+ TEXT_FIELD_NAME
+ "\": { "

View File

@ -32,7 +32,8 @@ public class RegexpIntervalsSourceProviderTests extends AbstractSerializingTestC
randomAlphaOfLengthBetween(0, 3) + (randomBoolean() ? ".*?" : "." + randomAlphaOfLength(4)) + randomAlphaOfLengthBetween(0, 5),
randomBoolean() ? RegexpFlag.resolveValue(randomFrom(FLAGS)) : RegexpFlag.ALL.value(),
randomBoolean() ? randomAlphaOfLength(10) : null,
randomBoolean() ? randomIntBetween(-1, Integer.MAX_VALUE) : null
randomBoolean() ? randomIntBetween(-1, Integer.MAX_VALUE) : null,
randomBoolean()
);
}
@ -42,7 +43,9 @@ public class RegexpIntervalsSourceProviderTests extends AbstractSerializingTestC
int flags = instance.getFlags();
String useField = instance.getUseField();
Integer maxExpansions = instance.getMaxExpansions();
int ran = between(0, 3);
boolean caseInsensitive = instance.isCaseInsensitive();
int ran = between(0, 4);
switch (ran) {
case 0:
pattern += randomBoolean() ? ".*?" : randomAlphaOfLength(5);
@ -56,10 +59,13 @@ public class RegexpIntervalsSourceProviderTests extends AbstractSerializingTestC
case 3:
maxExpansions = maxExpansions == null ? randomIntBetween(1, Integer.MAX_VALUE) : null;
break;
case 4:
caseInsensitive = !caseInsensitive;
break;
default:
throw new AssertionError("Illegal randomisation branch");
}
return new Regexp(pattern, flags, useField, maxExpansions);
return new Regexp(pattern, flags, useField, maxExpansions, caseInsensitive);
}
@Override