BAEL-5149 Remove accents from String in Java

This commit is contained in:
asia 2021-09-20 12:32:52 +02:00
parent da69f6f16b
commit fa5d7221e9
2 changed files with 93 additions and 0 deletions

View File

@ -0,0 +1,49 @@
package com.baeldung.accentsanddiacriticsremoval;
import java.text.Normalizer;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
/**
 * Static helpers for removing accents (diacritical marks) from strings and for
 * inspecting the code units of a string after canonical decomposition (NFD).
 *
 * <p>All public helpers are null-tolerant: a {@code null} input yields {@code null}.
 */
class StringNormalizer {

    /**
     * Matches runs of characters from the Unicode "Combining Diacritical Marks" block.
     * Compiled once instead of on every call.
     */
    private static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Removes accents by delegating to Apache Commons Lang.
     *
     * @param input the text to process
     * @return the result of {@code StringUtils.stripAccents(input)}
     */
    static String removeAccentsWithApacheCommons(String input) {
        return StringUtils.stripAccents(input);
    }

    /**
     * Removes accents using only the JDK: the input is decomposed with NFD and every
     * character from the combining-diacritical-marks block is then deleted.
     *
     * @param input the text to process, may be {@code null}
     * @return the text without combining diacritical marks, or {@code null} if
     *         {@code input} is {@code null}
     */
    static String removeAccents(String input) {
        String decomposed = normalize(input);
        if (decomposed == null) {
            // normalize() passes null through; dereferencing it here used to throw NPE.
            return null;
        }
        return DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
    }

    /**
     * Renders the NFD-normalized form of the input as space-separated
     * {@code \\uXXXX} escapes, one per UTF-16 code unit.
     *
     * @param input the text to process, may be {@code null}
     * @return the escaped representation (empty for an empty input), or {@code null}
     *         if {@code input} is {@code null}
     */
    static String unicodeValueOfNormalizedString(String input) {
        String decomposed = normalize(input);
        return decomposed == null ? null : toUnicode(decomposed);
    }

    /**
     * Applies canonical decomposition (NFD) to the input.
     *
     * @param input the text to normalize, may be {@code null}
     * @return the decomposed text, or {@code null} if {@code input} is {@code null}
     */
    private static String normalize(String input) {
        return input == null ? null : Normalizer.normalize(input, Normalizer.Form.NFD);
    }

    /**
     * Converts every UTF-16 code unit of the input to its {@code \\uXXXX} escape and
     * joins the escapes with single spaces.
     *
     * @param input the non-null text to convert
     * @return the joined escapes; a single-character input yields one escape with no
     *         separator, an empty input yields an empty string
     */
    private static String toUnicode(String input) {
        StringJoiner escapes = new StringJoiner(" ");
        for (char c : input.toCharArray()) {
            escapes.add(toUnicode(c));
        }
        return escapes.toString();
    }

    /**
     * Formats a single UTF-16 code unit as a backslash, {@code u}, and four
     * lower-case hexadecimal digits (zero-padded on the left).
     *
     * @param input the code unit to format
     * @return the escape sequence text, e.g. {@code \\u0061} for {@code 'a'}
     */
    private static String toUnicode(char input) {
        return String.format("\\u%04x", (int) input);
    }
}

View File

@ -0,0 +1,44 @@
package com.baeldung.accentsanddiacriticsremoval;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
import com.baeldung.accentsanddiacriticsremoval.StringNormalizer;
/**
 * Unit tests for {@link StringNormalizer}: accent removal via the JDK and via Apache
 * Commons, and the escaped-code-unit view of NFD-normalized strings.
 */
class StringNormalizerUnitTest {

    /** Accented letters used as input by the accent-removal tests for decomposable characters. */
    private static final String DECOMPOSABLE_INPUT = "āăąēîïĩíĝġńñšŝśûůŷ";

    /** Expected result of stripping accents from {@link #DECOMPOSABLE_INPUT}. */
    private static final String DECOMPOSABLE_STRIPPED = "aaaeiiiiggnnsssuuy";

    /** Letters used as input by the tests for non-decomposable characters. */
    private static final String NON_DECOMPOSABLE_INPUT = "łđħœ";

    @Test
    void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccents_thenReturnASCIIString() {
        String actual = StringNormalizer.removeAccents(DECOMPOSABLE_INPUT);

        assertEquals(DECOMPOSABLE_STRIPPED, actual);
    }

    @Test
    void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnASCIIString() {
        String actual = StringNormalizer.removeAccentsWithApacheCommons(DECOMPOSABLE_INPUT);

        assertEquals(DECOMPOSABLE_STRIPPED, actual);
    }

    @Test
    void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccents_thenReturnOriginalString() {
        String actual = StringNormalizer.removeAccents(NON_DECOMPOSABLE_INPUT);

        assertEquals("łđħœ", actual);
    }

    @Test
    void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnModifiedString() {
        String actual = StringNormalizer.removeAccentsWithApacheCommons(NON_DECOMPOSABLE_INPUT);

        // The expected value starts with a plain ASCII 'l'; the remaining letters are unchanged.
        assertEquals("lđħœ", actual);
    }

    @Test
    void givenStringWithDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnUnicodeValue() {
        assertUnicodeValue("\\u0061 \\u0304", "ā");
        assertUnicodeValue("\\u0069 \\u0308", "ï");
        assertUnicodeValue("\\u006e \\u0301", "ń");
    }

    @Test
    void givenStringWithNonDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnOriginalValue() {
        assertUnicodeValue("\\u0142", "ł");
        assertUnicodeValue("\\u0127", "ħ");
        assertUnicodeValue("\\u0111", "đ");
    }

    /**
     * Asserts that {@link StringNormalizer#unicodeValueOfNormalizedString(String)}
     * returns the expected escaped representation for the given input.
     */
    private static void assertUnicodeValue(String expected, String input) {
        assertEquals(expected, StringNormalizer.unicodeValueOfNormalizedString(input));
    }
}