This commit is contained in:
parent
40bd271f53
commit
8559ff7cee
|
@ -12,12 +12,17 @@ import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A collection of messy feature extractors
|
* A collection of messy feature extractors
|
||||||
*/
|
*/
|
||||||
public final class FeatureUtils {
|
public final class FeatureUtils {
|
||||||
|
|
||||||
|
private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");
|
||||||
|
private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");
|
||||||
|
private static final Pattern TURKISH_I = Pattern.compile("\\u0130");
|
||||||
|
|
||||||
private FeatureUtils() {}
|
private FeatureUtils() {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -56,20 +61,18 @@ public final class FeatureUtils {
|
||||||
String newText = text.startsWith(" ") ? "" : " ";
|
String newText = text.startsWith(" ") ? "" : " ";
|
||||||
|
|
||||||
// 2. Replace punctuation and whitespace with ' '
|
// 2. Replace punctuation and whitespace with ' '
|
||||||
// NOTE: we capture unicode letters AND marks as Nepalese and other languages
|
|
||||||
newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " ");
|
|
||||||
|
|
||||||
// 2.1. Replace spacing modifier characters
|
// 2.1. Replace spacing modifier characters
|
||||||
newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " ");
|
// NOTE: we capture unicode letters AND marks as Nepalese and other languages
|
||||||
|
newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" ");
|
||||||
|
|
||||||
// 3. Add space at end
|
// 3. Add space at end
|
||||||
newText += text.endsWith(" ") ? "" : " ";
|
newText += text.endsWith(" ") ? "" : " ";
|
||||||
|
|
||||||
// 4. Remove multiple spaces (2 or more) with a single space
|
// 4. Remove multiple spaces (2 or more) with a single space
|
||||||
newText = newText.replaceAll("\\p{IsWhite_Space}+", " ");
|
newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" ");
|
||||||
|
|
||||||
// 5. Replace Turkish İ with I (TODO - check this out better...)
|
// 5. Replace Turkish İ with I (TODO - check this out better...)
|
||||||
newText = newText.replaceAll("\\u0130", "I");
|
newText = TURKISH_I.matcher(newText).replaceAll("I");
|
||||||
|
|
||||||
return newText.toLowerCase(Locale.ROOT);
|
return newText.toLowerCase(Locale.ROOT);
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,13 +55,14 @@ public class NGramFeatureExtractor implements FeatureExtractor {
|
||||||
Map<String, Counter> charNGrams = new TreeMap<>();
|
Map<String, Counter> charNGrams = new TreeMap<>();
|
||||||
|
|
||||||
int countSum = 0;
|
int countSum = 0;
|
||||||
int end = newText.toString().length() - nGrams;
|
String textWithTerminators = newText.toString();
|
||||||
|
int end = textWithTerminators.length() - nGrams;
|
||||||
for (int start = 0; start <= end; ++start) {
|
for (int start = 0; start <= end; ++start) {
|
||||||
StringBuilder charNGram = new StringBuilder();
|
StringBuilder charNGram = new StringBuilder();
|
||||||
|
|
||||||
int index;
|
int index;
|
||||||
for (index = 0; index < nGrams; ++index) {
|
for (index = 0; index < nGrams; ++index) {
|
||||||
char currentChar = newText.toString().charAt(start + index);
|
char currentChar = textWithTerminators.charAt(start + index);
|
||||||
if (currentChar == ' ') {
|
if (currentChar == ' ') {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue