BAEL-6967-decode-unicode-in-string (#14901)

* BAEL-6967-decode-unicode-in-string

* update unit test

---------

Co-authored-by: tienvn <tienvn@>
This commit is contained in:
vunamtien 2023-10-07 20:04:59 +07:00 committed by GitHub
parent e62aba145b
commit 3a816d6b3d
3 changed files with 74 additions and 0 deletions

View File

@ -28,12 +28,18 @@
<artifactId>commons-vfs2</artifactId>
<version>${commons-vfs2.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>${apache-commons-text.version}</version>
</dependency>
</dependencies>
<properties>
<commons-compress.version>1.23.0</commons-compress.version>
<ant.version>1.10.13</ant.version>
<commons-vfs2.version>2.9.0</commons-vfs2.version>
<apache-commons-text.version>1.10.0</apache-commons-text.version>
</properties>
</project>

View File

@ -0,0 +1,29 @@
package com.baeldung.commons.convertunicode;
import org.apache.commons.text.StringEscapeUtils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helpers that decode textual Unicode escapes (a backslash, the letter
 * {@code u} and four hexadecimal digits) back into the characters they denote.
 *
 * <p>Two interchangeable strategies are offered: one delegating to Apache
 * Commons Text and one relying only on {@code java.util.regex}.
 */
public class UnicodeConverterUtil {

    /**
     * Matches one escape: a literal backslash, {@code u}, then exactly four hex digits.
     * Compiled once because {@link Pattern} is immutable and safe to share.
     */
    private static final Pattern UNICODE_ESCAPE = Pattern.compile("\\\\u[0-9a-fA-F]{4}");

    /**
     * Decodes the input by delegating to {@link StringEscapeUtils#unescapeJava(String)}.
     *
     * @param input text that may contain Java-style escape sequences
     * @return whatever {@code StringEscapeUtils.unescapeJava} returns for {@code input}
     */
    public static String decodeWithApacheCommons(String input) {
        return StringEscapeUtils.unescapeJava(input);
    }

    /**
     * Decodes every four-hex-digit Unicode escape in the input using only the JDK.
     * Text that is not part of such an escape is copied through unchanged.
     *
     * @param input text that may contain Unicode escapes; may be {@code null}
     * @return the decoded text, or {@code null} when {@code input} is {@code null}
     *         (matching the documented null handling of the Commons Text variant)
     */
    public static String decodeWithPlainJava(String input) {
        if (input == null) {
            return null;
        }

        Matcher matcher = UNICODE_ESCAPE.matcher(input);
        StringBuilder decodedString = new StringBuilder();
        while (matcher.find()) {
            // Skip the two-character prefix (backslash + 'u'); the remaining four
            // characters are the hexadecimal UTF-16 code unit.
            String hexDigits = matcher.group().substring(2);
            char unicodeChar = (char) Integer.parseInt(hexDigits, 16);

            // appendReplacement interprets '\' and '$' in the replacement as
            // escape / group-reference syntax. Without quoting, an escape that
            // decodes to either character (U+005C or U+0024) would throw
            // IllegalArgumentException instead of being emitted literally.
            String literalReplacement = Matcher.quoteReplacement(String.valueOf(unicodeChar));
            matcher.appendReplacement(decodedString, literalReplacement);
        }
        matcher.appendTail(decodedString);
        return decodedString.toString();
    }
}

View File

@ -0,0 +1,39 @@
package com.baeldung.commons.convertunicode;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Verifies that both decoding strategies of {@link UnicodeConverterUtil}
 * yield the same expected text for a handful of representative inputs.
 */
public class UnicodeConverterUnitTest {

    @Test
    public void whenInputHaveUnicodeSequences_ThenDecode() {
        assertDecodedByBothStrategies("Hello World", "\\u0048\\u0065\\u006C\\u006C\\u006F World");
    }

    @Test
    public void whenInputHaveNoUnicodeSequences_ThenDoNothing() {
        // Plain text must come back untouched, so expected and input are the same value.
        String plainText = "Hello World";
        assertDecodedByBothStrategies(plainText, plainText);
    }

    @Test
    public void whenInputHaveUnicodeSequencesInMiddle_ThenDecode() {
        assertDecodedByBothStrategies("This is a test in the middle.", "This is a test \\u0069\\u006E the middle.");
    }

    @Test
    public void whenInputHaveMultipleUnicodeSequences_ThenDecode() {
        assertDecodedByBothStrategies(
            "Unicode: Hello World",
            "Unicode: \\u0048\\u0065\\u006C\\u006C\\u006F \\u0057\\u006F\\u0072\\u006C\\u0064");
    }

    /**
     * Runs the Commons Text decoder first and the plain-Java decoder second,
     * asserting that each one turns {@code encoded} into {@code expected}.
     */
    private static void assertDecodedByBothStrategies(String expected, String encoded) {
        assertEquals(expected, UnicodeConverterUtil.decodeWithApacheCommons(encoded));
        assertEquals(expected, UnicodeConverterUtil.decodeWithPlainJava(encoded));
    }
}