Add painless string split function (splitOnToken) (#39772)

Adds two String split functions to Painless that can be used without enabling regexes.
This commit is contained in:
Christian Mesh 2019-05-09 18:14:23 -04:00 committed by Jack Conradson
parent b23b06dded
commit 99a50ac3b7
4 changed files with 93 additions and 0 deletions

View File

@ -1253,6 +1253,8 @@ See the <<painless-api-reference-shared, Shared API>> for a high-level overview
* String {java11-javadoc}/java.base/java/lang/String.html#replace(java.lang.CharSequence,java.lang.CharSequence)[replace](CharSequence, CharSequence)
* String replaceAll(Pattern, Function)
* String replaceFirst(Pattern, Function)
* String[] splitOnToken(String)
* String[] splitOnToken(String, int)
* boolean {java11-javadoc}/java.base/java/lang/String.html#startsWith(java.lang.String)[startsWith](String)
* boolean {java11-javadoc}/java.base/java/lang/String.html#startsWith(java.lang.String,int)[startsWith](String, int)
* CharSequence {java11-javadoc}/java.base/java/lang/CharSequence.html#subSequence(int,int)[subSequence](int, int)

View File

@ -503,4 +503,53 @@ public class Augmentation {
public static String decodeBase64(String receiver) {
    // View the receiver's characters as UTF-8 bytes; these are the Base64 payload.
    final byte[] encoded = receiver.getBytes(StandardCharsets.UTF_8);
    // Decode with the basic Base64 decoder, then interpret the raw bytes as UTF-8 text.
    final byte[] decoded = Base64.getDecoder().decode(encoded);
    return new String(decoded, StandardCharsets.UTF_8);
}
/**
 * Split 'receiver' on every occurrence of the literal 'token', with no cap on the number of pieces.
 * Delegates to the three-argument overload, passing a limit of -1.
 */
public static String[] splitOnToken(String receiver, String token) {
    final int unlimited = -1;
    return splitOnToken(receiver, token, unlimited);
}
/**
 * Split 'receiver' on occurrences of the literal 'token' (no regex involved).
 *
 * A positive 'limit' caps the number of returned elements: splitting stops once the countdown
 * reaches 1, and whatever is left of 'receiver' becomes the final element. A limit of 1 therefore
 * returns 'receiver' unsplit. Any limit less than 1 imposes no cap.
 *
 * If 'receiver' or 'token' is null or empty, or 'token' is longer than 'receiver', no split is
 * attempted and a single-element array holding 'receiver' is returned.
 */
public static String[] splitOnToken(String receiver, String token, int limit) {
    // A split is only possible with two non-empty strings where the token fits inside the receiver
    final boolean splittable = receiver != null
        && token != null
        && !receiver.isEmpty()
        && !token.isEmpty()
        && token.length() <= receiver.length();
    if (!splittable) {
        return new String[] { receiver };
    }

    // Segments collected so far, in order of appearance
    final ArrayList<String> pieces = new ArrayList<>();
    final int step = token.length();

    // Searching with indexOf(token, cursor) avoids building a new substring to search on every pass
    int cursor = 0;
    int remaining = limit;
    int match;

    // Stop when the countdown reaches 1 or when no further token is found.
    // A starting value below 1 never reaches 1 before the receiver is exhausted: even
    // Integer.MIN_VALUE just wraps to MAX_VALUE and keeps counting down, while the number of
    // matches is bounded by the receiver length.
    while (remaining != 1 && (match = receiver.indexOf(token, cursor)) != -1) {
        pieces.add(receiver.substring(cursor, match));
        // Resume the search just past the token we consumed
        cursor = match + step;
        remaining--;
    }

    // Whatever follows the last consumed token (possibly the empty string) is the final segment
    pieces.add(receiver.substring(cursor));
    return pieces.toArray(new String[pieces.size()]);
}
}

View File

@ -758,6 +758,8 @@ class java.lang.String {
String copyValueOf(char[],int,int)
String org.elasticsearch.painless.api.Augmentation decodeBase64()
String org.elasticsearch.painless.api.Augmentation encodeBase64()
String[] org.elasticsearch.painless.api.Augmentation splitOnToken(String)
String[] org.elasticsearch.painless.api.Augmentation splitOnToken(String, int)
boolean endsWith(String)
boolean equalsIgnoreCase(String)
String format(Locale,String,def[])

View File

@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
public class AugmentationTests extends ScriptTestCase {
@ -199,4 +200,43 @@ public class AugmentationTests extends ScriptTestCase {
assertEquals(8, exec("def ft = new org.elasticsearch.painless.FeatureTestObject();" +
" ft.setX(3); ft.setY(2); return ft.addToTotal(3)"));
}
/**
 * One split scenario for the test: the text to split, the literal token to split on, and the
 * limit handed to both the reference split and splitOnToken.
 */
private static class SplitCase {
    final String input;
    final String token;
    final int count;

    /** Scenario with the default limit of -1. */
    SplitCase(String input, String token) {
        this(input, token, -1);
    }

    /** Scenario with an explicit limit. */
    SplitCase(String sourceText, String delimiter, int maxPieces) {
        input = sourceText;
        token = delimiter;
        count = maxPieces;
    }
}
/**
 * Runs splitOnToken through a Painless script for each scenario and checks the result against
 * String.split using the regex-quoted token and the same limit.
 */
public void testString_SplitOnToken() {
    SplitCase[] cases = new SplitCase[] {
        new SplitCase("", ""),
        new SplitCase("a,b,c", ","),
        new SplitCase("a,b,c", "|"),
        new SplitCase("a,,b,,c", ","),
        new SplitCase("a,,b,,c", ",", 1),
        new SplitCase("a,,b,,c", ",", 3),
        new SplitCase("a,,b,,c", ",", 300),
        new SplitCase("a,b,c", "a,b,c,d"),
        new SplitCase("aaaaaaa", "a"),
        new SplitCase("aaaaaaa", "a", 2),
        new SplitCase("1.1.1.1.111", "1"),
        new SplitCase("1.1.1.1.111", "."),
        new SplitCase("1\n1.1.\r\n1\r\n111", "\r\n"),
    };

    for (SplitCase split : cases) {
        // Reference result: the token is quoted so the regex engine treats it literally
        String[] expected = split.input.split(Pattern.quote(split.token), split.count);

        // Embed the scenario directly into the script source as string and int literals
        String script = "return \"" + split.input + "\".splitOnToken(\"" + split.token + "\", " + split.count + ");";
        String[] actual = (String[]) exec(script);

        assertArrayEquals(expected, actual);
    }
}
}