From 8559ff7cee6367dcdd8df781272bc98c7f073c5d Mon Sep 17 00:00:00 2001
From: Benjamin Trent <ben.w.trent@gmail.com>
Date: Mon, 27 Jan 2020 12:12:34 -0500
Subject: [PATCH] [ML][Inference] fixing pattern compilation + unnecessary
 string copy (#51483) (#51487)

---
 .../customwordembedding/FeatureUtils.java         | 15 +++++++++------
 .../NGramFeatureExtractor.java                    |  5 +++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java
index 7224ee41086..04ff817c0be 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java
@@ -12,12 +12,17 @@ import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 /**
  * A collection of messy feature extractors
  */
 public final class FeatureUtils {
 
+    private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");
+    private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");
+    private static final Pattern TURKISH_I = Pattern.compile("\\u0130");
+
     private FeatureUtils() {}
 
     /**
@@ -56,20 +61,18 @@ public final class FeatureUtils {
         String newText = text.startsWith(" ") ? "" : " ";
 
         // 2. Replace punctuation and whitespace with ' '
-        // NOTE: we capture unicode letters AND marks as Nepalese and other languages
-        newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " ");
-
         // 2.1. Replace spacing modifier characters
-        newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " ");
+        // NOTE: we capture unicode letters AND marks as Nepalese and other languages
+        newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" ");
 
         // 3. Add space at end
         newText += text.endsWith(" ") ? "" : " ";
 
         // 4. Remove multiple spaces (2 or more) with a single space
-        newText = newText.replaceAll("\\p{IsWhite_Space}+", " ");
+        newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" ");
 
         // 5. Replace Turkish İ with I (TODO - check this out better...)
-        newText = newText.replaceAll("\\u0130", "I");
+        newText = TURKISH_I.matcher(newText).replaceAll("I");
 
         return newText.toLowerCase(Locale.ROOT);
     }
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java
index 06dc68e1f28..a5827369764 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java
@@ -55,13 +55,14 @@ public class NGramFeatureExtractor implements FeatureExtractor {
         Map<String, Counter> charNGrams = new TreeMap<>();
 
         int countSum = 0;
-        int end = newText.toString().length() - nGrams;
+        String textWithTerminators = newText.toString();
+        int end = textWithTerminators.length() - nGrams;
         for (int start = 0; start <= end; ++start) {
             StringBuilder charNGram = new StringBuilder();
 
             int index;
             for (index = 0; index < nGrams; ++index) {
-                char currentChar = newText.toString().charAt(start + index);
+                char currentChar = textWithTerminators.charAt(start + index);
                 if (currentChar == ' ') {
                     break;
                 }