BAEL-5149 (#11241)
* Init
* Removing mvnw files
* Apply eclipse code format
* Refactoring
* Refactoring
* BAEL-4211 Add benchmarks
* Delete hexagonal directory
* Refactoring based on the feedback
* Refactoring based on feedback - package rename
* Directory rename
* BAEL-5149 Remove accents from String in Java
* BAEL-5149 Remove accents from String in Java
* Including suggestions after a review

Co-authored-by: asia <joannakrzeklubowiecka@protonmail.com>
This commit is contained in:
parent
979db86a51
commit
2f183181d4
|
@ -0,0 +1,49 @@
|
||||||
|
package com.baeldung.accentsanddiacriticsremoval;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
|
class StringNormalizer {
|
||||||
|
|
||||||
|
static String removeAccentsWithApacheCommons(String input) {
|
||||||
|
return StringUtils.stripAccents(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
static String removeAccents(String input) {
|
||||||
|
return normalize(input).replaceAll("\\p{M}", "");
|
||||||
|
}
|
||||||
|
|
||||||
|
static String unicodeValueOfNormalizedString(String input) {
|
||||||
|
return toUnicode(normalize(input));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String normalize(String input) {
|
||||||
|
return input == null ? null : Normalizer.normalize(input, Normalizer.Form.NFKD);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String toUnicode(String input) {
|
||||||
|
if (input.length() == 1) {
|
||||||
|
return toUnicode(input.charAt(0));
|
||||||
|
} else {
|
||||||
|
StringJoiner stringJoiner = new StringJoiner(" ");
|
||||||
|
for (char c : input.toCharArray()) {
|
||||||
|
stringJoiner.add(toUnicode(c));
|
||||||
|
}
|
||||||
|
return stringJoiner.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String toUnicode(char input) {
|
||||||
|
|
||||||
|
String hex = Integer.toHexString(input);
|
||||||
|
StringBuilder sb = new StringBuilder(hex);
|
||||||
|
|
||||||
|
while (sb.length() < 4) {
|
||||||
|
sb.insert(0, "0");
|
||||||
|
}
|
||||||
|
sb.insert(0, "\\u");
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
package com.baeldung.accentsanddiacriticsremoval;
|
||||||
|
|
||||||
|
import org.junit.Before;
import org.junit.Test;
import org.openjdk.jmh.annotations.Setup;

import java.text.Collator;

import static java.lang.Character.*;
import static java.lang.String.valueOf;
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
public class CollatorUnitTest {
|
||||||
|
|
||||||
|
private final Collator collator = Collator.getInstance();
|
||||||
|
|
||||||
|
@Setup
|
||||||
|
public void setup() {
|
||||||
|
collator.setDecomposition(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void givenAccentedStringAndPrimaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||||
|
Collator collator = Collator.getInstance();
|
||||||
|
collator.setDecomposition(2);
|
||||||
|
collator.setStrength(0);
|
||||||
|
assertEquals(0, collator.compare("a", "a"));
|
||||||
|
assertEquals(0, collator.compare("ä", "a"));
|
||||||
|
assertEquals(0, collator.compare("A", "a"));
|
||||||
|
assertEquals(1, collator.compare("b", "a"));
|
||||||
|
assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void givenAccentedStringAndSecondaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||||
|
collator.setStrength(1);
|
||||||
|
assertEquals(1, collator.compare("ä", "a"));
|
||||||
|
assertEquals(1, collator.compare("b", "a"));
|
||||||
|
assertEquals(0, collator.compare("A", "a"));
|
||||||
|
assertEquals(0, collator.compare("a", "a"));
|
||||||
|
assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void givenAccentedStringAndTeriaryCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||||
|
collator.setStrength(2);
|
||||||
|
assertEquals(1, collator.compare("A", "a"));
|
||||||
|
assertEquals(1, collator.compare("ä", "a"));
|
||||||
|
assertEquals(1, collator.compare("b", "a"));
|
||||||
|
assertEquals(0, collator.compare("a", "a"));
|
||||||
|
assertEquals(0, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void givenAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||||
|
collator.setStrength(3);
|
||||||
|
assertEquals(1, collator.compare("A", "a"));
|
||||||
|
assertEquals(1, collator.compare("ä", "a"));
|
||||||
|
assertEquals(1, collator.compare("b", "a"));
|
||||||
|
assertEquals(-1, collator.compare(valueOf(toChars(0x0001)), valueOf(toChars(0x0002))));
|
||||||
|
assertEquals(0, collator.compare("a", "a"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void givenNondecomposableAccentedStringAndIdenticalCollatorStrength_whenCompareWithASCIIString_thenReturnTrue() {
|
||||||
|
collator.setStrength(0);
|
||||||
|
assertEquals(1, collator.compare("ł", "l"));
|
||||||
|
assertEquals(1, collator.compare("ø", "o"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
package com.baeldung.accentsanddiacriticsremoval;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import java.text.Normalizer;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class StringNormalizerUnitTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void givenNotNormalizedString_whenIsNormalized_thenReturnFalse() {
|
||||||
|
assertFalse(Normalizer.isNormalized("āăąēîïĩíĝġńñšŝśûůŷ", Normalizer.Form.NFKD));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccents_thenReturnASCIIString() {
|
||||||
|
assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccents("āăąēîïĩíĝġńñšŝśûůŷ"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenStringWithDecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnASCIIString() {
|
||||||
|
assertEquals("aaaeiiiiggnnsssuuy", StringNormalizer.removeAccentsWithApacheCommons("āăąēîïĩíĝġńñšŝśûůŷ"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccents_thenReturnOriginalString() {
|
||||||
|
assertEquals("łđħœ", StringNormalizer.removeAccents("łđħœ"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenStringWithNondecomposableUnicodeCharacters_whenRemoveAccentsWithApacheCommons_thenReturnModifiedString() {
|
||||||
|
assertEquals("lđħœ", StringNormalizer.removeAccentsWithApacheCommons("łđħœ"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenStringWithDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnUnicodeValue() {
|
||||||
|
assertEquals("\\u0066 \\u0069", StringNormalizer.unicodeValueOfNormalizedString("fi"));
|
||||||
|
assertEquals("\\u0061 \\u0304", StringNormalizer.unicodeValueOfNormalizedString("ā"));
|
||||||
|
assertEquals("\\u0069 \\u0308", StringNormalizer.unicodeValueOfNormalizedString("ï"));
|
||||||
|
assertEquals("\\u006e \\u0301", StringNormalizer.unicodeValueOfNormalizedString("ń"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void givenStringWithNonDecomposableUnicodeCharacters_whenUnicodeValueOfNormalizedString_thenReturnOriginalValue() {
|
||||||
|
assertEquals("\\u0142", StringNormalizer.unicodeValueOfNormalizedString("ł"));
|
||||||
|
assertEquals("\\u0127", StringNormalizer.unicodeValueOfNormalizedString("ħ"));
|
||||||
|
assertEquals("\\u0111", StringNormalizer.unicodeValueOfNormalizedString("đ"));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue