BAEL-5149 Remove accents from String in Java
This commit is contained in:
parent
da69f6f16b
commit
fa5d7221e9
@ -0,0 +1,49 @@
|
||||
package com.baeldung.accentsanddiacriticsremoval;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
class StringNormalizer {
|
||||
|
||||
static String removeAccentsWithApacheCommons(String input) {
|
||||
return StringUtils.stripAccents(input);
|
||||
}
|
||||
|
||||
static String removeAccents(String input) {
|
||||
return normalize(input).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
|
||||
}
|
||||
|
||||
static String unicodeValueOfNormalizedString(String input) {
|
||||
return toUnicode(normalize(input));
|
||||
}
|
||||
|
||||
private static String normalize(String input) {
|
||||
return input == null ? null : Normalizer.normalize(input, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
private static String toUnicode(String input) {
|
||||
if (input.length() == 1) {
|
||||
return toUnicode(input.charAt(0));
|
||||
} else {
|
||||
StringJoiner stringJoiner = new StringJoiner(" ");
|
||||
for (char c : input.toCharArray()) {
|
||||
stringJoiner.add(toUnicode(c));
|
||||
}
|
||||
return stringJoiner.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static String toUnicode(char input) {
|
||||
|
||||
String hex = Integer.toHexString(input);
|
||||
StringBuilder sb = new StringBuilder(hex);
|
||||
|
||||
while (sb.length() < 4) {
|
||||
sb.insert(0, "0");
|
||||
}
|
||||
sb.insert(0, "\\u");
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,44 @@
|
||||
package com.baeldung.accentsanddiacriticsremoval;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.baeldung.accentsanddiacriticsremoval.StringNormalizer;
|
||||
|
||||
class StringNormalizerUnitTest {
|
||||
|
||||
@Test
|
||||
void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccents_thenReturnASCIIString() {
|
||||
assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccents("āăąēîïĩíĝġńñšŝśûůŷ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnASCIIString() {
|
||||
assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccentsWithApacheCommons("āăąēîïĩíĝġńñšŝśûůŷ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccents_thenReturnOriginalString() {
|
||||
assertEquals("łđħœ", StringNormalizer.removeAccents("łđħœ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnModifiedString() {
|
||||
assertEquals("lđħœ", StringNormalizer.removeAccentsWithApacheCommons("łđħœ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnUnicodeValue() {
|
||||
assertEquals("\\u0061 \\u0304", StringNormalizer.unicodeValueOfNormalizedString("ā"));
|
||||
assertEquals("\\u0069 \\u0308", StringNormalizer.unicodeValueOfNormalizedString("ï"));
|
||||
assertEquals("\\u006e \\u0301", StringNormalizer.unicodeValueOfNormalizedString("ń"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithNonDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnOriginalValue() {
|
||||
assertEquals("\\u0142", StringNormalizer.unicodeValueOfNormalizedString("ł"));
|
||||
assertEquals("\\u0127", StringNormalizer.unicodeValueOfNormalizedString("ħ"));
|
||||
assertEquals("\\u0111", StringNormalizer.unicodeValueOfNormalizedString("đ"));
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user