StringUtils.stripAccents() should handle ligatures, UTF32 math blocks, etc. (#1201)
* Modified stripAccents to handle ligatures, UTF32 math blocks, etc. * Changed StringUtils.isEmpty to isEmpty. * Added a stripAccents Javadoc reference to the Unicode Normalization Chart.
This commit is contained in:
parent
03abd8fab0
commit
fc014c23fe
|
@ -8119,7 +8119,8 @@ public class StringUtils {
|
||||||
/**
|
/**
|
||||||
* Removes diacritics (~= accents) from a string. The case will not be altered.
|
* Removes diacritics (~= accents) from a string. The case will not be altered.
|
||||||
* <p>For instance, 'à' will be replaced by 'a'.</p>
|
* <p>For instance, 'à' will be replaced by 'a'.</p>
|
||||||
* <p>Note that ligatures will be left as is.</p>
|
* <p>Decomposes ligatures and digraphs per the KD column in the
|
||||||
|
* <a href="https://www.unicode.org/charts/normalization/">Unicode Normalization Chart</a>.</p>
|
||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* StringUtils.stripAccents(null) = null
|
* StringUtils.stripAccents(null) = null
|
||||||
|
@ -8135,12 +8136,11 @@ public class StringUtils {
|
||||||
*/
|
*/
|
||||||
// See also Lucene's ASCIIFoldingFilter (Lucene 2.9) that replaces accented characters by their unaccented equivalent (and uncommitted bug fix: https://issues.apache.org/jira/browse/LUCENE-1343?focusedCommentId=12858907&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#action_12858907).
|
// See also Lucene's ASCIIFoldingFilter (Lucene 2.9) that replaces accented characters by their unaccented equivalent (and uncommitted bug fix: https://issues.apache.org/jira/browse/LUCENE-1343?focusedCommentId=12858907&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#action_12858907).
|
||||||
public static String stripAccents(final String input) {
|
public static String stripAccents(final String input) {
|
||||||
if (input == null) {
|
if (isEmpty(input)) {
|
||||||
return null;
|
return input;
|
||||||
}
|
}
|
||||||
final StringBuilder decomposed = new StringBuilder(Normalizer.normalize(input, Normalizer.Form.NFD));
|
final StringBuilder decomposed = new StringBuilder(Normalizer.normalize(input, Normalizer.Form.NFKD));
|
||||||
convertRemainingAccentCharacters(decomposed);
|
convertRemainingAccentCharacters(decomposed);
|
||||||
// Note that this doesn't correctly remove ligatures...
|
|
||||||
return STRIP_ACCENTS_PATTERN.matcher(decomposed).replaceAll(EMPTY);
|
return STRIP_ACCENTS_PATTERN.matcher(decomposed).replaceAll(EMPTY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -89,14 +89,28 @@ public class StringUtilsTrimStripTest extends AbstractLangTest {
|
||||||
assertEquals("eclair", StringUtils.stripAccents("\u00E9clair"), "Failed to handle easy example");
|
assertEquals("eclair", StringUtils.stripAccents("\u00E9clair"), "Failed to handle easy example");
|
||||||
assertEquals("ALOSZZCN aloszzcn", StringUtils.stripAccents("\u0104\u0141\u00D3\u015A\u017B\u0179\u0106\u0143 "
|
assertEquals("ALOSZZCN aloszzcn", StringUtils.stripAccents("\u0104\u0141\u00D3\u015A\u017B\u0179\u0106\u0143 "
|
||||||
+ "\u0105\u0142\u00F3\u015B\u017C\u017A\u0107\u0144"));
|
+ "\u0105\u0142\u00F3\u015B\u017C\u017A\u0107\u0144"));
|
||||||
|
assertEquals("The cafe\u2019s pinata gave me deja vu.", StringUtils
|
||||||
|
.stripAccents("The caf\u00e9\u2019s pi\u00f1ata gave me d\u00e9j\u00e0 vu."),
|
||||||
|
"Failed to handle accented text");
|
||||||
|
assertEquals("fluid quest", StringUtils.stripAccents("\ufb02uid que\ufb06"), "Failed to handle ligatures");
|
||||||
|
assertEquals("a b c 1 2 3", StringUtils
|
||||||
|
.stripAccents("\u1d43 \u1d47 \u1d9c \u00b9 \u00b2 \u00b3"), "Failed to handle superscript text");
|
||||||
|
assertEquals("math italic", StringUtils
|
||||||
|
.stripAccents("\uD835\uDC5A\uD835\uDC4E\uD835\uDC61\u210E " +
|
||||||
|
"\uD835\uDC56\uD835\uDC61\uD835\uDC4E\uD835\uDC59\uD835\uDC56\uD835\uDC50"),
|
||||||
|
"Failed to handle UTF32 example");
|
||||||
|
assertEquals("\uD83D\uDF01 \uD83D\uDF02 \uD83D\uDF03 \uD83D\uDF04", StringUtils
|
||||||
|
.stripAccents("\uD83D\uDF01 \uD83D\uDF02 \uD83D\uDF03 \uD83D\uDF04"),
|
||||||
|
"Failed to handle non-accented text");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
public void testStripAccents_Korean() {
|
public void testStripAccents_Korean() {
|
||||||
// LANG-1655
|
// LANG-1655
|
||||||
final String input = "잊지마 넌 흐린 어둠사이 왼손으로 그린 별 하나";
|
final String input = "\uC78A\uC9C0\uB9C8 \uB10C \uD750\uB9B0 \uC5B4\uB460\uC0AC\uC774 " +
|
||||||
assertEquals(input, StringUtils.stripAccents(input), "Failed to handle non-accented text");
|
"\uC67C\uC190\uC73C\uB85C \uADF8\uB9B0 \uBCC4 \uD558\uB098";
|
||||||
|
assertEquals(input, StringUtils.stripAccents(input), "Failed to handle Korean text");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue