[ML][Inference] fixing pattern compilation + unnecessary string copy (#51483) (#51487)

This commit is contained in:
Benjamin Trent 2020-01-27 12:12:34 -05:00 committed by GitHub
parent 40bd271f53
commit 8559ff7cee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 8 deletions

View File

@ -12,12 +12,17 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction; import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Locale; import java.util.Locale;
import java.util.regex.Pattern;
/** /**
* A collection of messy feature extractors * A collection of messy feature extractors
*/ */
public final class FeatureUtils { public final class FeatureUtils {
// Regexes used by the text clean-up steps below, compiled once as constants instead of
// handing the regex strings to String.replaceAll on every call.

// Matches a single character that should be blanked out. Three alternatives:
//   1. [^\p{L}|\p{M}|\s] - anything that is NOT a Unicode letter, a Unicode mark, whitespace
//      or '|' (inside a character class '|' is a literal character, not alternation);
//   2. \|                - a literal '|', which alternative 1 excludes;
//   3. \p{InSpacing_Modifier_Letters} - any character in the Spacing Modifier Letters block.
private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");
// Matches a run of one or more characters that have the Unicode White_Space property.
private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");
// Matches U+0130, the Turkish capital dotted I.
private static final Pattern TURKISH_I = Pattern.compile("\\u0130");
private FeatureUtils() {} private FeatureUtils() {}
/** /**
@ -56,20 +61,18 @@ public final class FeatureUtils {
String newText = text.startsWith(" ") ? "" : " "; String newText = text.startsWith(" ") ? "" : " ";
// 2. Replace punctuation and whitespace with ' ' // 2. Replace punctuation and whitespace with ' '
// NOTE: we capture unicode letters AND marks as Nepalese and other languages
newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " ");
// 2.1. Replace spacing modifier characters // 2.1. Replace spacing modifier characters
newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " "); // NOTE: we capture unicode letters AND marks as Nepalese and other languages
newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" ");
// 3. Add space at end // 3. Add space at end
newText += text.endsWith(" ") ? "" : " "; newText += text.endsWith(" ") ? "" : " ";
// 4. Remove multiple spaces (2 or more) with a single space // 4. Remove multiple spaces (2 or more) with a single space
newText = newText.replaceAll("\\p{IsWhite_Space}+", " "); newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" ");
// 5. Replace Turkish İ with I (TODO - check this out better...) // 5. Replace Turkish İ with I (TODO - check this out better...)
newText = newText.replaceAll("\\u0130", "I"); newText = TURKISH_I.matcher(newText).replaceAll("I");
return newText.toLowerCase(Locale.ROOT); return newText.toLowerCase(Locale.ROOT);
} }

View File

@ -55,13 +55,14 @@ public class NGramFeatureExtractor implements FeatureExtractor {
Map<String, Counter> charNGrams = new TreeMap<>(); Map<String, Counter> charNGrams = new TreeMap<>();
int countSum = 0; int countSum = 0;
int end = newText.toString().length() - nGrams; String textWithTerminators = newText.toString();
int end = textWithTerminators.length() - nGrams;
for (int start = 0; start <= end; ++start) { for (int start = 0; start <= end; ++start) {
StringBuilder charNGram = new StringBuilder(); StringBuilder charNGram = new StringBuilder();
int index; int index;
for (index = 0; index < nGrams; ++index) { for (index = 0; index < nGrams; ++index) {
char currentChar = newText.toString().charAt(start + index); char currentChar = textWithTerminators.charAt(start + index);
if (currentChar == ' ') { if (currentChar == ' ') {
break; break;
} }