* Init

* Removing mvnw files

* Apply eclipse code format

* Refactoring

* Refactoring

* BAEL-4211 Add benchmarks

* Delete hexagonal directory

* Refactoring based on the feedback

* Refactoring based on feedback - package rename

* Directory rename

* BAEL-5149 Remove accents from String in Java

* BAEL-5149 Remove accents from String in Java

* Including suggestions after a review

Co-authored-by: asia <joannakrzeklubowiecka@protonmail.com>
This commit is contained in:
JoannaaKL 2021-10-17 08:36:43 +02:00 committed by GitHub
parent 979db86a51
commit 2f183181d4
3 changed files with 170 additions and 0 deletions

View File

@ -0,0 +1,49 @@
package com.baeldung.accentsanddiacriticsremoval;
import org.apache.commons.lang3.StringUtils;
import java.text.Normalizer;
import java.util.StringJoiner;
/**
 * Helpers for stripping accents and other diacritical marks from text, and for
 * inspecting the UTF-16 code units of a string after NFKD normalization.
 *
 * <p>All entry points accept {@code null}. The JDK-based methods return
 * {@code null} for a {@code null} argument instead of throwing.
 */
class StringNormalizer {

    /**
     * Strips accents by delegating to Apache Commons Lang.
     *
     * @param input text to process; passed to the library unchanged
     * @return the result of {@link StringUtils#stripAccents(String)} for {@code input}
     */
    static String removeAccentsWithApacheCommons(String input) {
        return StringUtils.stripAccents(input);
    }

    /**
     * Strips accents using only the JDK: the input is decomposed with NFKD and
     * every character in the Unicode "Mark" category (regex {@code \p{M}}) is
     * then removed.
     *
     * @param input text to process, may be {@code null}
     * @return the text without combining marks, or {@code null} if {@code input} is {@code null}
     */
    static String removeAccents(String input) {
        String decomposed = normalize(input);
        // normalize() maps null to null; guard here so the regex step does not throw.
        return decomposed == null ? null : decomposed.replaceAll("\\p{M}", "");
    }

    /**
     * Renders the NFKD-normalized form of {@code input} as space-separated
     * escapes, one per UTF-16 code unit (backslash, {@code u}, four lowercase
     * hex digits).
     *
     * @param input text to process, may be {@code null}
     * @return the escaped code units (empty string for empty input), or
     *         {@code null} if {@code input} is {@code null}
     */
    static String unicodeValueOfNormalizedString(String input) {
        String decomposed = normalize(input);
        return decomposed == null ? null : toUnicode(decomposed);
    }

    /** Applies NFKD normalization, passing {@code null} through untouched. */
    private static String normalize(String input) {
        return input == null ? null : Normalizer.normalize(input, Normalizer.Form.NFKD);
    }

    /**
     * Joins the escape of every code unit of {@code input} with single spaces.
     * A one-character string yields a single escape with no separator, so no
     * special case is needed for it.
     */
    private static String toUnicode(String input) {
        StringJoiner escapes = new StringJoiner(" ");
        for (char codeUnit : input.toCharArray()) {
            escapes.add(toUnicode(codeUnit));
        }
        return escapes.toString();
    }

    /** Formats one code unit as backslash + {@code u} + hex value left-padded with zeros to four digits. */
    private static String toUnicode(char input) {
        String hex = Integer.toHexString(input);
        StringBuilder escape = new StringBuilder("\\u");
        for (int pad = hex.length(); pad < 4; pad++) {
            escape.append('0');
        }
        return escape.append(hex).toString();
    }
}

View File

@ -0,0 +1,70 @@
package com.baeldung.accentsanddiacriticsremoval;
import org.junit.Before;
import org.junit.Test;
import org.openjdk.jmh.annotations.Setup;
import java.text.Collator;
import static java.lang.Character.*;
import static java.lang.String.valueOf;
import static org.junit.Assert.assertEquals;
/**
 * Shows how the comparison result of a {@link Collator} changes with its
 * strength setting for accented letters, letter case and control characters.
 */
public class CollatorUnitTest {

    private final Collator collator = Collator.getInstance();

    /**
     * Enables full decomposition on the shared collator before each test.
     * Uses JUnit's {@code @Before}; the JMH {@code @Setup} annotation is not
     * recognised by the JUnit runner, so the method would never be invoked.
     */
    @Before
    public void setup() {
        collator.setDecomposition(Collator.FULL_DECOMPOSITION);
    }

    /** Primary strength: the accented and upper-case variants compare equal to "a". */
    @Test
    public void givenAccentedStringAndPrimaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
        collator.setStrength(Collator.PRIMARY);

        assertEquals(0, collator.compare("a", "a"));
        assertEquals(0, collator.compare("ä", "a"));
        assertEquals(0, collator.compare("A", "a"));
        assertEquals(1, collator.compare("b", "a"));
        assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
    }

    /** Secondary strength: the accent now makes a difference, letter case still does not. */
    @Test
    public void givenAccentedStringAndSecondaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
        collator.setStrength(Collator.SECONDARY);

        assertEquals(1, collator.compare("ä", "a"));
        assertEquals(1, collator.compare("b", "a"));
        assertEquals(0, collator.compare("A", "a"));
        assertEquals(0, collator.compare("a", "a"));
        assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
    }

    /** Tertiary strength: both accent and letter case make a difference. */
    @Test
    public void givenAccentedStringAndTeriaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
        collator.setStrength(Collator.TERTIARY);

        assertEquals(1, collator.compare("A", "a"));
        assertEquals(1, collator.compare("ä", "a"));
        assertEquals(1, collator.compare("b", "a"));
        assertEquals(0, collator.compare("a", "a"));
        assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
    }

    /** Identical strength: the two control characters U+0001 and U+0002 are told apart as well. */
    @Test
    public void givenAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
        collator.setStrength(Collator.IDENTICAL);

        assertEquals(1, collator.compare("A", "a"));
        assertEquals(1, collator.compare("ä", "a"));
        assertEquals(1, collator.compare("b", "a"));
        assertEquals(-1, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
        assertEquals(0, collator.compare("a", "a"));
    }

    /**
     * Letters with a stroke are expected to differ from their base letter.
     * Note: despite the method name, this test runs at primary strength.
     */
    @Test
    public void givenNondecomposableAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
        collator.setStrength(Collator.PRIMARY);

        assertEquals(1, collator.compare("ł", "l"));
        assertEquals(1, collator.compare("ø", "o"));
    }
}

View File

@ -0,0 +1,51 @@
package com.baeldung.accentsanddiacriticsremoval;
import static org.junit.Assert.assertFalse;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.text.Normalizer;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link StringNormalizer}: accent removal via the JDK and via
 * Apache Commons Lang, and the escaped code-unit view of normalized text.
 */
class StringNormalizerUnitTest {

    /** Precomposed accented letters are not already in NFKD form. */
    @Test
    public void givenNotNormalizedString_whenIsNormalized_thenReturnFalse() {
        assertFalse(Normalizer.isNormalized("āăąēîïĩíĝġńñšŝśûůŷ", Normalizer.Form.NFKD));
    }

    @Test
    void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccents_thenReturnASCIIString() {
        assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccents("āăąēîïĩíĝġńñšŝśûůŷ"));
    }

    @Test
    void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnASCIIString() {
        assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccentsWithApacheCommons("āăąēîïĩíĝġńñšŝśûůŷ"));
    }

    /** Letters whose stroke is not a separate combining mark are left unchanged by the JDK approach. */
    @Test
    void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccents_thenReturnOriginalString() {
        assertEquals("łđħœ", StringNormalizer.removeAccents("łđħœ"));
    }

    /** The Commons Lang approach is expected to additionally map the stroked l to a plain l. */
    @Test
    void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnModifiedString() {
        assertEquals("lđħœ", StringNormalizer.removeAccentsWithApacheCommons("łđħœ"));
    }

    @Test
    void givenStringWithDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnUnicodeValue() {
        // The "fi" ligature (U+FB01), written as an escape so the input survives
        // encoding changes; NFKD splits it into the two letters f and i.
        assertEquals("\\u0066 \\u0069", StringNormalizer.unicodeValueOfNormalizedString("\uFB01"));
        assertEquals("\\u0061 \\u0304", StringNormalizer.unicodeValueOfNormalizedString("ā"));
        assertEquals("\\u0069 \\u0308", StringNormalizer.unicodeValueOfNormalizedString("ï"));
        assertEquals("\\u006e \\u0301", StringNormalizer.unicodeValueOfNormalizedString("ń"));
    }

    @Test
    void givenStringWithNonDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnOriginalValue() {
        assertEquals("\\u0142", StringNormalizer.unicodeValueOfNormalizedString("ł"));
        assertEquals("\\u0127", StringNormalizer.unicodeValueOfNormalizedString("ħ"));
        assertEquals("\\u0111", StringNormalizer.unicodeValueOfNormalizedString("đ"));
    }
}