[ML][Inference] fixing pattern compilation + unnecessary string copy (#51483) (#51487)

2025-03-06 10:59:12 +00:00 · 2020-01-27 12:12:34 -05:00 · 2020-01-27 12:12:34 -05:00 · 8559ff7cee
commit 8559ff7cee
parent 40bd271f53
2 changed files with 12 additions and 8 deletions
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java
@@ -12,12 +12,17 @@ import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
+import java.util.regex.Pattern;

 /**
 * A collection of messy feature extractors
 */
 public final class FeatureUtils {

+    private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");
+    private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");
+    private static final Pattern TURKISH_I = Pattern.compile("\\u0130");
+
    private FeatureUtils() {}

    /**
@@ -56,20 +61,18 @@ public final class FeatureUtils {
        String newText = text.startsWith(" ") ? "" : " ";

        // 2. Replace punctuation and whitespace with ' '
-        // NOTE: we capture unicode letters AND marks as Nepalese and other languages
-        newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " ");
-
        // 2.1. Replace spacing modifier characters
-        newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " ");
+        // NOTE: unicode letters AND marks are both preserved, as Nepalese and other languages use combining marks
+        newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" ");

        // 3. Add space at end
        newText += text.endsWith(" ") ? "" : " ";

        // 4. Remove multiple spaces (2 or more) with a single space
-        newText = newText.replaceAll("\\p{IsWhite_Space}+", " ");
+        newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" ");

        // 5. Replace Turkish İ with I (TODO - check this out better...)
-        newText = newText.replaceAll("\\u0130", "I");
+        newText = TURKISH_I.matcher(newText).replaceAll("I");

        return newText.toLowerCase(Locale.ROOT);
    }
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java
@@ -55,13 +55,14 @@ public class NGramFeatureExtractor implements FeatureExtractor {
        Map<String, Counter> charNGrams = new TreeMap<>();

        int countSum = 0;
-        int end = newText.toString().length() - nGrams;
+        String textWithTerminators = newText.toString();
+        int end = textWithTerminators.length() - nGrams;
        for (int start = 0; start <= end; ++start) {
            StringBuilder charNGram = new StringBuilder();

            int index;
            for (index = 0; index < nGrams; ++index) {
-                char currentChar = newText.toString().charAt(start + index);
+                char currentChar = textWithTerminators.charAt(start + index);
                if (currentChar == ' ') {
                    break;
                }