Add painless string split function (splitOnToken) (#39772)
Adds two String split functions to Painless that can be used without enabling regexes.
This commit is contained in:
parent
b23b06dded
commit
99a50ac3b7
|
@ -1253,6 +1253,8 @@ See the <<painless-api-reference-shared, Shared API>> for a high-level overview
|
|||
* String {java11-javadoc}/java.base/java/lang/String.html#replace(java.lang.CharSequence,java.lang.CharSequence)[replace](CharSequence, CharSequence)
|
||||
* String replaceAll(Pattern, Function)
|
||||
* String replaceFirst(Pattern, Function)
|
||||
* String[] splitOnToken(String)
|
||||
* String[] splitOnToken(String, int)
|
||||
* boolean {java11-javadoc}/java.base/java/lang/String.html#startsWith(java.lang.String)[startsWith](String)
|
||||
* boolean {java11-javadoc}/java.base/java/lang/String.html#startsWith(java.lang.String,int)[startsWith](String, int)
|
||||
* CharSequence {java11-javadoc}/java.base/java/lang/CharSequence.html#subSequence(int,int)[subSequence](int, int)
|
||||
|
|
|
@ -503,4 +503,53 @@ public class Augmentation {
|
|||
public static String decodeBase64(String receiver) {
|
||||
return new String(Base64.getDecoder().decode(receiver.getBytes(StandardCharsets.UTF_8)), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Split 'receiver' by 'token' as many times as possible..
|
||||
*/
|
||||
public static String[] splitOnToken(String receiver, String token) {
|
||||
return splitOnToken(receiver, token, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Split 'receiver' by 'token' up to 'limit' times. Any limit less than 1 is ignored.
|
||||
*/
|
||||
public static String[] splitOnToken(String receiver, String token, int limit) {
|
||||
// Check if it's even possible to perform a split
|
||||
if (receiver == null || receiver.length() == 0 || token == null || token.length() == 0 || receiver.length() < token.length()) {
|
||||
return new String[] { receiver };
|
||||
}
|
||||
|
||||
// List of string segments we have found
|
||||
ArrayList<String> result = new ArrayList<String>();
|
||||
|
||||
// Keep track of where we are in the string
|
||||
// indexOf(tok, startPos) is faster than creating a new search context ever loop with substring(start, end)
|
||||
int pos = 0;
|
||||
|
||||
// Loop until we hit the limit or forever if we are passed in less than one (signifying no limit)
|
||||
// If Integer.MIN_VALUE is passed in, it will still continue to loop down to 1 from MAX_VALUE
|
||||
// This edge case should be fine as we are limited by receiver length (Integer.MAX_VALUE) even if we split at every char
|
||||
for(;limit != 1; limit--) {
|
||||
|
||||
// Find the next occurrence of token after current pos
|
||||
int idx = receiver.indexOf(token, pos);
|
||||
|
||||
// Reached the end of the string without another match
|
||||
if (idx == -1) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Add the found segment to the result list
|
||||
result.add(receiver.substring(pos, idx));
|
||||
|
||||
// Move our search position to the next possible location
|
||||
pos = idx + token.length();
|
||||
}
|
||||
// Add the remaining string to the result list
|
||||
result.add(receiver.substring(pos));
|
||||
|
||||
// O(N) or faster depending on implementation
|
||||
return result.toArray(new String[0]);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -758,6 +758,8 @@ class java.lang.String {
|
|||
String copyValueOf(char[],int,int)
|
||||
String org.elasticsearch.painless.api.Augmentation decodeBase64()
|
||||
String org.elasticsearch.painless.api.Augmentation encodeBase64()
|
||||
String[] org.elasticsearch.painless.api.Augmentation splitOnToken(String)
|
||||
String[] org.elasticsearch.painless.api.Augmentation splitOnToken(String, int)
|
||||
boolean endsWith(String)
|
||||
boolean equalsIgnoreCase(String)
|
||||
String format(Locale,String,def[])
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Arrays;
|
|||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class AugmentationTests extends ScriptTestCase {
|
||||
|
||||
|
@ -199,4 +200,43 @@ public class AugmentationTests extends ScriptTestCase {
|
|||
assertEquals(8, exec("def ft = new org.elasticsearch.painless.FeatureTestObject();" +
|
||||
" ft.setX(3); ft.setY(2); return ft.addToTotal(3)"));
|
||||
}
|
||||
|
||||
private static class SplitCase {
|
||||
final String input;
|
||||
final String token;
|
||||
final int count;
|
||||
|
||||
SplitCase(String input, String token, int count) {
|
||||
this.input = input;
|
||||
this.token = token;
|
||||
this.count = count;
|
||||
}
|
||||
SplitCase(String input, String token) {
|
||||
this(input, token, -1);
|
||||
}
|
||||
}
|
||||
public void testString_SplitOnToken() {
|
||||
SplitCase[] cases = new SplitCase[] {
|
||||
new SplitCase("", ""),
|
||||
new SplitCase("a,b,c", ","),
|
||||
new SplitCase("a,b,c", "|"),
|
||||
new SplitCase("a,,b,,c", ","),
|
||||
new SplitCase("a,,b,,c", ",", 1),
|
||||
new SplitCase("a,,b,,c", ",", 3),
|
||||
new SplitCase("a,,b,,c", ",", 300),
|
||||
new SplitCase("a,b,c", "a,b,c,d"),
|
||||
new SplitCase("aaaaaaa", "a"),
|
||||
new SplitCase("aaaaaaa", "a", 2),
|
||||
new SplitCase("1.1.1.1.111", "1"),
|
||||
new SplitCase("1.1.1.1.111", "."),
|
||||
new SplitCase("1\n1.1.\r\n1\r\n111", "\r\n"),
|
||||
};
|
||||
for (SplitCase split : cases) {
|
||||
//System.out.println(String.format("Splitting '%s' by '%s' %d times", split.input, split.token, split.count));
|
||||
assertArrayEquals(
|
||||
split.input.split(Pattern.quote(split.token), split.count),
|
||||
(String[])exec("return \""+split.input+"\".splitOnToken(\""+split.token+"\", "+split.count+");")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue