diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java index 7224ee41086..04ff817c0be 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java @@ -12,12 +12,17 @@ import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.util.Locale; +import java.util.regex.Pattern; /** * A collection of messy feature extractors */ public final class FeatureUtils { + private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}"); + private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+"); + private static final Pattern TURKISH_I = Pattern.compile("\\u0130"); + private FeatureUtils() {} /** @@ -56,20 +61,18 @@ public final class FeatureUtils { String newText = text.startsWith(" ") ? "" : " "; // 2. Replace punctuation and whitespace with ' ' - // NOTE: we capture unicode letters AND marks as Nepalese and other languages - newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " "); - // 2.1. Replace spacing modifier characters - newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " "); + // NOTE: we capture unicode letters AND marks as Nepalese and other languages + newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" "); // 3. Add space at end newText += text.endsWith(" ") ? "" : " "; // 4. Remove multiple spaces (2 or more) with a single space - newText = newText.replaceAll("\\p{IsWhite_Space}+", " "); + newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" "); // 5. Replace Turkish İ with I (TODO - check this out better...) - newText = newText.replaceAll("\\u0130", "I"); + newText = TURKISH_I.matcher(newText).replaceAll("I"); return newText.toLowerCase(Locale.ROOT); } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java index 06dc68e1f28..a5827369764 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java @@ -55,13 +55,14 @@ public class NGramFeatureExtractor implements FeatureExtractor { Map charNGrams = new TreeMap<>(); int countSum = 0; - int end = newText.toString().length() - nGrams; + String textWithTerminators = newText.toString(); + int end = textWithTerminators.length() - nGrams; for (int start = 0; start <= end; ++start) { StringBuilder charNGram = new StringBuilder(); int index; for (index = 0; index < nGrams; ++index) { - char currentChar = newText.toString().charAt(start + index); + char currentChar = textWithTerminators.charAt(start + index); if (currentChar == ' ') { break; }