Shortcut simple patterns ending in `*` (#43904)
When profiling a call to `AllocationService#reroute()` in a large cluster containing allocation filters of the form `node-name-*` I observed a nontrivial amount of time spent in `Regex#simpleMatch` due to these allocation filters. Patterns ending in a wildcard are not uncommon, and this change treats them as a special case in `Regex#simpleMatch` in order to shave a bit of time off this calculation. It also uses `String#regionMatches()` to avoid an allocation in the case that the pattern's only wildcard is at the start.

Microbenchmark results before this change:

    Result "org.elasticsearch.common.regex.RegexStartsWithBenchmark.performSimpleMatch":
      1113.839 ±(99.9%) 6.338 ns/op [Average]
      (min, avg, max) = (1102.388, 1113.839, 1135.783), stdev = 9.486
      CI (99.9%): [1107.502, 1120.177] (assumes normal distribution)

Microbenchmark results with this change applied:

    Result "org.elasticsearch.common.regex.RegexStartsWithBenchmark.performSimpleMatch":
      433.190 ±(99.9%) 0.644 ns/op [Average]
      (min, avg, max) = (431.518, 433.190, 435.456), stdev = 0.964
      CI (99.9%): [432.546, 433.833] (assumes normal distribution)

The microbenchmark in question was:

    @Fork(3)
    @Warmup(iterations = 10)
    @Measurement(iterations = 10)
    @BenchmarkMode(Mode.AverageTime)
    @OutputTimeUnit(TimeUnit.NANOSECONDS)
    @State(Scope.Benchmark)
    @SuppressWarnings("unused") //invoked by benchmarking framework
    public class RegexStartsWithBenchmark {

        private static final String testString = "abcdefghijklmnopqrstuvwxyz";
        private static final String[] patterns;

        static {
            patterns = new String[testString.length() + 1];
            for (int i = 0; i <= testString.length(); i++) {
                patterns[i] = testString.substring(0, i) + "*";
            }
        }

        @Benchmark
        public void performSimpleMatch() {
            for (int i = 0; i < patterns.length; i++) {
                Regex.simpleMatch(patterns[i], testString);
            }
        }
    }
This commit is contained in:
parent
3317169c4f
commit
9cecc31cdc
|
@ -88,7 +88,7 @@ public class Regex {
|
||||||
if (pattern == null || str == null) {
|
if (pattern == null || str == null) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int firstIndex = pattern.indexOf('*');
|
final int firstIndex = pattern.indexOf('*');
|
||||||
if (firstIndex == -1) {
|
if (firstIndex == -1) {
|
||||||
return pattern.equals(str);
|
return pattern.equals(str);
|
||||||
}
|
}
|
||||||
|
@ -96,14 +96,15 @@ public class Regex {
|
||||||
if (pattern.length() == 1) {
|
if (pattern.length() == 1) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
int nextIndex = pattern.indexOf('*', firstIndex + 1);
|
final int nextIndex = pattern.indexOf('*', firstIndex + 1);
|
||||||
if (nextIndex == -1) {
|
if (nextIndex == -1) {
|
||||||
return str.endsWith(pattern.substring(1));
|
// str.endsWith(pattern.substring(1)), but avoiding the construction of pattern.substring(1):
|
||||||
|
return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1);
|
||||||
} else if (nextIndex == 1) {
|
} else if (nextIndex == 1) {
|
||||||
// Double wildcard "**" - skipping the first "*"
|
// Double wildcard "**" - skipping the first "*"
|
||||||
return simpleMatch(pattern.substring(1), str);
|
return simpleMatch(pattern.substring(1), str);
|
||||||
}
|
}
|
||||||
String part = pattern.substring(1, nextIndex);
|
final String part = pattern.substring(1, nextIndex);
|
||||||
int partIndex = str.indexOf(part);
|
int partIndex = str.indexOf(part);
|
||||||
while (partIndex != -1) {
|
while (partIndex != -1) {
|
||||||
if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
|
if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
|
||||||
|
@ -113,9 +114,9 @@ public class Regex {
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return (str.length() >= firstIndex &&
|
return str.regionMatches(0, pattern, 0, firstIndex)
|
||||||
pattern.substring(0, firstIndex).equals(str.substring(0, firstIndex)) &&
|
&& (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string
|
||||||
simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
|
|| simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -63,4 +63,24 @@ public class RegexTests extends ESTestCase {
|
||||||
assertTrue(Regex.simpleMatch("fff*******ddd", "fffabcddd"));
|
assertTrue(Regex.simpleMatch("fff*******ddd", "fffabcddd"));
|
||||||
assertFalse(Regex.simpleMatch("fff******ddd", "fffabcdd"));
|
assertFalse(Regex.simpleMatch("fff******ddd", "fffabcdd"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSimpleMatch() {
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
final String matchingString = randomAlphaOfLength(between(0, 50));
|
||||||
|
|
||||||
|
// construct a pattern that matches this string by repeatedly replacing random substrings with '*' characters
|
||||||
|
String pattern = matchingString;
|
||||||
|
for (int shrink = between(0, 5); shrink > 0; shrink--) {
|
||||||
|
final int shrinkStart = between(0, pattern.length());
|
||||||
|
final int shrinkEnd = between(shrinkStart, pattern.length());
|
||||||
|
pattern = pattern.substring(0, shrinkStart) + "*" + pattern.substring(shrinkEnd);
|
||||||
|
}
|
||||||
|
assertTrue("[" + pattern + "] should match [" + matchingString + "]", Regex.simpleMatch(pattern, matchingString));
|
||||||
|
|
||||||
|
// construct a pattern that does not match this string by inserting a non-matching character (a digit)
|
||||||
|
final int insertPos = between(0, pattern.length());
|
||||||
|
pattern = pattern.substring(0, insertPos) + between(0, 9) + pattern.substring(insertPos);
|
||||||
|
assertFalse("[" + pattern + "] should not match [" + matchingString + "]", Regex.simpleMatch(pattern, matchingString));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue