Shortcut simple patterns ending in `*` (#43904)

When profiling a call to `AllocationService#reroute()` in a large cluster
containing allocation filters of the form `node-name-*`, I observed a nontrivial
amount of time spent in `Regex#simpleMatch` due to these allocation filters.
Patterns ending in a wildcard are not uncommon, and this change treats them as
a special case in `Regex#simpleMatch` in order to shave a bit of time off this
calculation. It also uses `String#regionMatches()` to avoid an allocation in
the case that the pattern's only wildcard is at the start.

Microbenchmark results before this change:

    Result "org.elasticsearch.common.regex.RegexStartsWithBenchmark.performSimpleMatch":
      1113.839 ±(99.9%) 6.338 ns/op [Average]
      (min, avg, max) = (1102.388, 1113.839, 1135.783), stdev = 9.486
      CI (99.9%): [1107.502, 1120.177] (assumes normal distribution)

Microbenchmark results with this change applied:

    Result "org.elasticsearch.common.regex.RegexStartsWithBenchmark.performSimpleMatch":
      433.190 ±(99.9%) 0.644 ns/op [Average]
      (min, avg, max) = (431.518, 433.190, 435.456), stdev = 0.964
      CI (99.9%): [432.546, 433.833] (assumes normal distribution)

The microbenchmark in question was:

    /**
     * Microbenchmark for {@code Regex.simpleMatch} on patterns whose only wildcard is a trailing {@code *}.
     *
     * The patterns are every prefix of {@link #testString} (from the empty prefix up to the whole string)
     * followed by {@code "*"}, i.e. {@code "*"}, {@code "a*"}, {@code "ab*"}, ... Each benchmark invocation
     * matches all of these patterns against {@link #testString}, so one reported operation covers
     * {@code testString.length() + 1} calls to {@code Regex.simpleMatch}, not a single call.
     *
     * The annotations are presumably JMH's (the imports are not shown here): 3 forks, 10 warmup and
     * 10 measurement iterations, reporting average time in nanoseconds.
     */
    @Fork(3)
    @Warmup(iterations = 10)
    @Measurement(iterations = 10)
    @BenchmarkMode(Mode.AverageTime)
    @OutputTimeUnit(TimeUnit.NANOSECONDS)
    @State(Scope.Benchmark)
    @SuppressWarnings("unused") //invoked by benchmarking framework
    public class RegexStartsWithBenchmark {

        // The string every pattern is matched against; also the source of the pattern prefixes.
        private static final String testString = "abcdefghijklmnopqrstuvwxyz";
        // One pattern per prefix length 0..testString.length(), each terminated by a single '*'.
        private static final String[] patterns;

        static {
            // +1 and the inclusive upper bound below account for the empty prefix and the full-string prefix.
            patterns = new String[testString.length() + 1];
            for (int i = 0; i <= testString.length(); i++) {
                patterns[i] = testString.substring(0, i) + "*";
            }
        }

        /**
         * Runs {@code Regex.simpleMatch} once for every generated pattern against {@link #testString}.
         */
        @Benchmark
        public void performSimpleMatch() {
            for (int i = 0; i < patterns.length; i++) {
                // NOTE(review): the boolean result is discarded; if this is JMH, consider returning it or
                // passing it to a Blackhole so the JIT cannot eliminate the call - confirm before reusing.
                Regex.simpleMatch(patterns[i], testString);
            }
        }
    }
This commit is contained in:
David Turner 2019-07-03 14:14:03 +01:00
parent 3317169c4f
commit 9cecc31cdc
2 changed files with 28 additions and 7 deletions

View File

@ -88,7 +88,7 @@ public class Regex {
if (pattern == null || str == null) {
return false;
}
int firstIndex = pattern.indexOf('*');
final int firstIndex = pattern.indexOf('*');
if (firstIndex == -1) {
return pattern.equals(str);
}
@ -96,14 +96,15 @@ public class Regex {
if (pattern.length() == 1) {
return true;
}
int nextIndex = pattern.indexOf('*', firstIndex + 1);
final int nextIndex = pattern.indexOf('*', firstIndex + 1);
if (nextIndex == -1) {
return str.endsWith(pattern.substring(1));
// str.endsWith(pattern.substring(1)), but avoiding the construction of pattern.substring(1):
return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1);
} else if (nextIndex == 1) {
// Double wildcard "**" - skipping the first "*"
return simpleMatch(pattern.substring(1), str);
}
String part = pattern.substring(1, nextIndex);
final String part = pattern.substring(1, nextIndex);
int partIndex = str.indexOf(part);
while (partIndex != -1) {
if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
@ -113,9 +114,9 @@ public class Regex {
}
return false;
}
return (str.length() >= firstIndex &&
pattern.substring(0, firstIndex).equals(str.substring(0, firstIndex)) &&
simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
return str.regionMatches(0, pattern, 0, firstIndex)
&& (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string
|| simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
}
/**

View File

@ -63,4 +63,24 @@ public class RegexTests extends ESTestCase {
assertTrue(Regex.simpleMatch("fff*******ddd", "fffabcddd"));
assertFalse(Regex.simpleMatch("fff******ddd", "fffabcdd"));
}
/**
 * Randomised check of {@code Regex.simpleMatch}. Each round takes a random alphabetic string, derives a
 * wildcard pattern that must match it, and then corrupts that pattern with a digit so that it must not match.
 */
public void testSimpleMatch() {
    for (int round = 0; round < 1000; round++) {
        final String target = randomAlphaOfLength(between(0, 50));

        // Start from the literal string and collapse up to five random (possibly empty) ranges into '*';
        // every such replacement keeps the pattern matching the original string.
        String wildcardPattern = target;
        int replacementsLeft = between(0, 5);
        while (replacementsLeft > 0) {
            final int from = between(0, wildcardPattern.length());
            final int to = between(from, wildcardPattern.length());
            wildcardPattern = new StringBuilder(wildcardPattern).replace(from, to, "*").toString();
            replacementsLeft--;
        }
        final String matchMessage = "[" + wildcardPattern + "] should match [" + target + "]";
        assertTrue(matchMessage, Regex.simpleMatch(wildcardPattern, target));

        // The target contains letters only, so a literal digit anywhere in the pattern can never be matched.
        final int digitOffset = between(0, wildcardPattern.length());
        final int digit = between(0, 9);
        final String brokenPattern = new StringBuilder(wildcardPattern).insert(digitOffset, digit).toString();
        final String mismatchMessage = "[" + brokenPattern + "] should not match [" + target + "]";
        assertFalse(mismatchMessage, Regex.simpleMatch(brokenPattern, target));
    }
}
}