diff --git a/core-java-modules/core-java-string-operations-3/src/main/java/com/baeldung/accentsanddiacriticsremoval/StringNormalizer.java b/core-java-modules/core-java-string-operations-3/src/main/java/com/baeldung/accentsanddiacriticsremoval/StringNormalizer.java
new file mode 100644
index 0000000000..d33b9178ea
--- /dev/null
+++ b/core-java-modules/core-java-string-operations-3/src/main/java/com/baeldung/accentsanddiacriticsremoval/StringNormalizer.java
@@ -0,0 +1,73 @@
+package com.baeldung.accentsanddiacriticsremoval;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.text.Normalizer;
+import java.util.StringJoiner;
+import java.util.regex.Pattern;
+
+/**
+ * Utilities for stripping accents and diacritical marks from text and for inspecting the
+ * UTF-16 code units of the normalized result.
+ */
+final class StringNormalizer {
+
+    /** Matches any character in a Unicode "Mark" category; compiled once and reused. */
+    private static final Pattern MARKS = Pattern.compile("\\p{M}");
+
+    private StringNormalizer() {
+        // static utility class, not meant to be instantiated
+    }
+
+    /**
+     * Strips accents by delegating to Apache Commons Lang.
+     *
+     * @param input the text to clean, may be {@code null}
+     * @return whatever {@code StringUtils.stripAccents(input)} returns
+     */
+    static String removeAccentsWithApacheCommons(String input) {
+        return StringUtils.stripAccents(input);
+    }
+
+    /**
+     * Strips accents by NFKD-normalizing the input and deleting every mark character.
+     *
+     * @param input the text to clean, may be {@code null}
+     * @return the text without mark characters, or {@code null} when {@code input} is {@code null}
+     */
+    static String removeAccents(String input) {
+        String normalized = normalize(input);
+        return normalized == null ? null : MARKS.matcher(normalized).replaceAll("");
+    }
+
+    /**
+     * Renders each UTF-16 code unit of the NFKD-normalized input as a backslash-u escape with
+     * four lowercase hex digits, separated by single spaces.
+     *
+     * @param input the text to inspect, may be {@code null}
+     * @return the escaped code units, or {@code null} when {@code input} is {@code null}
+     */
+    static String unicodeValueOfNormalizedString(String input) {
+        String normalized = normalize(input);
+        return normalized == null ? null : toUnicode(normalized);
+    }
+
+    private static String normalize(String input) {
+        return input == null ? null : Normalizer.normalize(input, Normalizer.Form.NFKD);
+    }
+
+    private static String toUnicode(String input) {
+        // A joiner holding one element emits no delimiter, so single-character input needs no
+        // special case.
+        StringJoiner joiner = new StringJoiner(" ");
+        for (char c : input.toCharArray()) {
+            joiner.add(toUnicode(c));
+        }
+        return joiner.toString();
+    }
+
+    private static String toUnicode(char input) {
+        // %04x zero-pads to four lowercase hex digits, which is the most a char can need.
+        return String.format("\\u%04x", (int) input);
+    }
+}
diff --git a/core-java-modules/core-java-string-operations-3/src/test/java/com/baeldung/accentsanddiacriticsremoval/CollatorUnitTest.java b/core-java-modules/core-java-string-operations-3/src/test/java/com/baeldung/accentsanddiacriticsremoval/CollatorUnitTest.java
new file mode 100644
index 0000000000..93b4f5af2e
--- /dev/null
+++ b/core-java-modules/core-java-string-operations-3/src/test/java/com/baeldung/accentsanddiacriticsremoval/CollatorUnitTest.java
@@ -0,0 +1,72 @@
+package com.baeldung.accentsanddiacriticsremoval;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.text.Collator;
+
+import static java.lang.Character.toChars;
+import static java.lang.String.valueOf;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Demonstrates how each {@link Collator} strength compares accented, cased and control characters.
+ */
+public class CollatorUnitTest {
+
+    private final Collator collator = Collator.getInstance();
+
+    // JUnit 4 only runs fixture methods annotated with @Before; JMH's @Setup is ignored by it.
+    @Before
+    public void setup() {
+        collator.setDecomposition(Collator.FULL_DECOMPOSITION);
+    }
+
+    @Test
+    public void givenAccentedStringAndPrimaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
+        collator.setStrength(Collator.PRIMARY);
+        assertEquals(0, collator.compare("a", "a"));
+        assertEquals(0, collator.compare("ä", "a"));
+        assertEquals(0, collator.compare("A", "a"));
+        assertEquals(1, collator.compare("b", "a"));
+        assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
+    }
+
+    @Test
+    public void givenAccentedStringAndSecondaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
+        collator.setStrength(Collator.SECONDARY);
+        assertEquals(1, collator.compare("ä", "a"));
+        assertEquals(1, collator.compare("b", "a"));
+        assertEquals(0, collator.compare("A", "a"));
+        assertEquals(0, collator.compare("a", "a"));
+        assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
+    }
+
+    @Test
+    public void givenAccentedStringAndTeriaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
+        collator.setStrength(Collator.TERTIARY);
+        assertEquals(1, collator.compare("A", "a"));
+        assertEquals(1, collator.compare("ä", "a"));
+        assertEquals(1, collator.compare("b", "a"));
+        assertEquals(0, collator.compare("a", "a"));
+        assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
+    }
+
+    @Test
+    public void givenAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
+        collator.setStrength(Collator.IDENTICAL);
+        assertEquals(1, collator.compare("A", "a"));
+        assertEquals(1, collator.compare("ä", "a"));
+        assertEquals(1, collator.compare("b", "a"));
+        assertEquals(-1, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
+        assertEquals(0, collator.compare("a", "a"));
+    }
+
+    @Test
+    public void givenNondecomposableAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
+        // Despite the method name, this check runs at PRIMARY strength.
+        collator.setStrength(Collator.PRIMARY);
+        assertEquals(1, collator.compare("ł", "l"));
+        assertEquals(1, collator.compare("ø", "o"));
+    }
+}
diff --git a/core-java-modules/core-java-string-operations-3/src/test/java/com/baeldung/accentsanddiacriticsremoval/StringNormalizerUnitTest.java b/core-java-modules/core-java-string-operations-3/src/test/java/com/baeldung/accentsanddiacriticsremoval/StringNormalizerUnitTest.java
new file mode 100644
index 0000000000..74359726b7
--- /dev/null
+++ b/core-java-modules/core-java-string-operations-3/src/test/java/com/baeldung/accentsanddiacriticsremoval/StringNormalizerUnitTest.java
@@ -0,0 +1,58 @@
+package com.baeldung.accentsanddiacriticsremoval;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.text.Normalizer;
+
+import org.junit.jupiter.api.Test;
+
+class StringNormalizerUnitTest {
+
+    @Test
+    void givenNotNormalizedString_whenIsNormalized_thenReturnFalse() {
+        assertFalse(Normalizer.isNormalized("āăąēîïĩíĝġńñšŝśûůŷ", Normalizer.Form.NFKD));
+    }
+
+    @Test
+    void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccents_thenReturnASCIIString() {
+        assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccents("āăąēîïĩíĝġńñšŝśûůŷ"));
+    }
+
+    @Test
+    void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnASCIIString() {
+        assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccentsWithApacheCommons("āăąēîïĩíĝġńñšŝśûůŷ"));
+    }
+
+    @Test
+    void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccents_thenReturnOriginalString() {
+        assertEquals("łđħœ", StringNormalizer.removeAccents("łđħœ"));
+    }
+
+    @Test
+    void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnModifiedString() {
+        assertEquals("lđħœ", StringNormalizer.removeAccentsWithApacheCommons("łđħœ"));
+    }
+
+    @Test
+    void givenStringWithDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnUnicodeValue() {
+        assertEquals("\\u0066 \\u0069", StringNormalizer.unicodeValueOfNormalizedString("fi"));
+        assertEquals("\\u0061 \\u0304", StringNormalizer.unicodeValueOfNormalizedString("ā"));
+        assertEquals("\\u0069 \\u0308", StringNormalizer.unicodeValueOfNormalizedString("ï"));
+        assertEquals("\\u006e \\u0301", StringNormalizer.unicodeValueOfNormalizedString("ń"));
+    }
+
+    @Test
+    void givenStringWithNonDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnOriginalValue() {
+        assertEquals("\\u0142", StringNormalizer.unicodeValueOfNormalizedString("ł"));
+        assertEquals("\\u0127", StringNormalizer.unicodeValueOfNormalizedString("ħ"));
+        assertEquals("\\u0111", StringNormalizer.unicodeValueOfNormalizedString("đ"));
+    }
+
+    @Test
+    void givenNullInput_whenNormalizing_thenReturnNull() {
+        assertNull(StringNormalizer.removeAccents(null));
+        assertNull(StringNormalizer.unicodeValueOfNormalizedString(null));
+    }
+}