BAEL-5149 (#11241)
* Init
* Removing mvnw files
* Apply eclipse code format
* Refactoring
* Refactoring
* BAEL-4211 Add benchmarks
* Delete hexagonal directory
* Refactoring based on the feedback
* Refactoring based on feedback - package rename
* Directory rename
* BAEL-5149 Remove accents from String in Java
* BAEL-5149 Remove accents from String in Java
* Including suggestions after a review

Co-authored-by: asia <joannakrzeklubowiecka@protonmail.com>
This commit is contained in:
parent
979db86a51
commit
2f183181d4
|
@ -0,0 +1,49 @@
|
|||
package com.baeldung.accentsanddiacriticsremoval;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
class StringNormalizer {
|
||||
|
||||
static String removeAccentsWithApacheCommons(String input) {
|
||||
return StringUtils.stripAccents(input);
|
||||
}
|
||||
|
||||
static String removeAccents(String input) {
|
||||
return normalize(input).replaceAll("\\p{M}", "");
|
||||
}
|
||||
|
||||
static String unicodeValueOfNormalizedString(String input) {
|
||||
return toUnicode(normalize(input));
|
||||
}
|
||||
|
||||
private static String normalize(String input) {
|
||||
return input == null ? null : Normalizer.normalize(input, Normalizer.Form.NFKD);
|
||||
}
|
||||
|
||||
private static String toUnicode(String input) {
|
||||
if (input.length() == 1) {
|
||||
return toUnicode(input.charAt(0));
|
||||
} else {
|
||||
StringJoiner stringJoiner = new StringJoiner(" ");
|
||||
for (char c : input.toCharArray()) {
|
||||
stringJoiner.add(toUnicode(c));
|
||||
}
|
||||
return stringJoiner.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static String toUnicode(char input) {
|
||||
|
||||
String hex = Integer.toHexString(input);
|
||||
StringBuilder sb = new StringBuilder(hex);
|
||||
|
||||
while (sb.length() < 4) {
|
||||
sb.insert(0, "0");
|
||||
}
|
||||
sb.insert(0, "\\u");
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
package com.baeldung.accentsanddiacriticsremoval;
|
||||
|
||||
import org.junit.Before;
import org.junit.Test;
import org.openjdk.jmh.annotations.Setup;

import java.text.Collator;

import static java.lang.Character.*;
import static java.lang.String.valueOf;
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class CollatorUnitTest {
|
||||
|
||||
private final Collator collator = Collator.getInstance();
|
||||
|
||||
@Setup
|
||||
public void setup() {
|
||||
collator.setDecomposition(2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenAccentedStringAndPrimaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||
Collator collator = Collator.getInstance();
|
||||
collator.setDecomposition(2);
|
||||
collator.setStrength(0);
|
||||
assertEquals(0, collator.compare("a", "a"));
|
||||
assertEquals(0, collator.compare("ä", "a"));
|
||||
assertEquals(0, collator.compare("A", "a"));
|
||||
assertEquals(1, collator.compare("b", "a"));
|
||||
assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenAccentedStringAndSecondaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||
collator.setStrength(1);
|
||||
assertEquals(1, collator.compare("ä", "a"));
|
||||
assertEquals(1, collator.compare("b", "a"));
|
||||
assertEquals(0, collator.compare("A", "a"));
|
||||
assertEquals(0, collator.compare("a", "a"));
|
||||
assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenAccentedStringAndTeriaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||
collator.setStrength(2);
|
||||
assertEquals(1, collator.compare("A", "a"));
|
||||
assertEquals(1, collator.compare("ä", "a"));
|
||||
assertEquals(1, collator.compare("b", "a"));
|
||||
assertEquals(0, collator.compare("a", "a"));
|
||||
assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||
collator.setStrength(3);
|
||||
assertEquals(1, collator.compare("A", "a"));
|
||||
assertEquals(1, collator.compare("ä", "a"));
|
||||
assertEquals(1, collator.compare("b", "a"));
|
||||
assertEquals(-1, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||
assertEquals(0, collator.compare("a", "a"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenNondecomposableAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||
collator.setStrength(0);
|
||||
assertEquals(1, collator.compare("ł", "l"));
|
||||
assertEquals(1, collator.compare("ø", "o"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package com.baeldung.accentsanddiacriticsremoval;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.text.Normalizer;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class StringNormalizerUnitTest {
|
||||
|
||||
@Test
|
||||
public void givenNotNormalizedString_whenIsNormalized_thenReturnFalse() {
|
||||
assertFalse(Normalizer.isNormalized("āăąēîïĩíĝġńñšŝśûůŷ", Normalizer.Form.NFKD));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccents_thenReturnASCIIString() {
|
||||
assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccents("āăąēîïĩíĝġńñšŝśûůŷ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnASCIIString() {
|
||||
assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccentsWithApacheCommons("āăąēîïĩíĝġńñšŝśûůŷ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccents_thenReturnOriginalString() {
|
||||
assertEquals("łđħœ", StringNormalizer.removeAccents("łđħœ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnModifiedString() {
|
||||
assertEquals("lđħœ", StringNormalizer.removeAccentsWithApacheCommons("łđħœ"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnUnicodeValue() {
|
||||
assertEquals("\\u0066 \\u0069", StringNormalizer.unicodeValueOfNormalizedString("fi"));
|
||||
assertEquals("\\u0061 \\u0304", StringNormalizer.unicodeValueOfNormalizedString("ā"));
|
||||
assertEquals("\\u0069 \\u0308", StringNormalizer.unicodeValueOfNormalizedString("ï"));
|
||||
assertEquals("\\u006e \\u0301", StringNormalizer.unicodeValueOfNormalizedString("ń"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void givenStringWithNonDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnOriginalValue() {
|
||||
assertEquals("\\u0142", StringNormalizer.unicodeValueOfNormalizedString("ł"));
|
||||
assertEquals("\\u0127", StringNormalizer.unicodeValueOfNormalizedString("ħ"));
|
||||
assertEquals("\\u0111", StringNormalizer.unicodeValueOfNormalizedString("đ"));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue