BAEL-7175: UTF-8 Validation in Java (#15428)
This commit is contained in:
parent
ca584307b2
commit
56a4afe866
|
@ -24,6 +24,21 @@
|
|||
<artifactId>commons-text</artifactId>
|
||||
<version>${commons-text.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>${apache.tika.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-parsers-standard-package</artifactId>
|
||||
<version>${apache.tika.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
<version>${icu4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
|
@ -60,7 +75,9 @@
|
|||
<maven.compiler.source>11</maven.compiler.source>
|
||||
<maven.compiler.target>11</maven.compiler.target>
|
||||
<apache.commons.lang3.version>3.13.0</apache.commons.lang3.version>
|
||||
<apache.tika.version>2.9.1</apache.tika.version>
|
||||
<commons-text.version>1.10.0</commons-text.version>
|
||||
<icu4j.version>74.1</icu4j.version>
|
||||
<liquibase.core.version>4.25.0</liquibase.core.version>
|
||||
</properties>
|
||||
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
package com.baeldung.utf8validation;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import org.apache.tika.detect.EncodingDetector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.txt.UniversalEncodingDetector;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.*;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class UTF8ValidationUnitTest {
|
||||
|
||||
private static final String UTF8_STRING = "Hello 你好";
|
||||
|
||||
private static final byte[] UTF8_BYTES = UTF8_STRING.getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
private static final byte[] INVALID_UTF8_BYTES = {(byte) 0xF0, (byte) 0xC1, (byte) 0x8C, (byte) 0xBC, (byte) 0xD1};
|
||||
|
||||
private static final InputStream ENGLISH_INPUTSTREAM = new ByteArrayInputStream("Hello".getBytes(StandardCharsets.UTF_8));
|
||||
|
||||
private static final InputStream UTF8_INPUTSTREAM = new ByteArrayInputStream(UTF8_BYTES);
|
||||
|
||||
private static final InputStream INVALID_UTF8_INPUTSTREAM = new ByteArrayInputStream(INVALID_UTF8_BYTES);
|
||||
|
||||
@Test
|
||||
void whenConvertValidUTF8BytesToString_thenReturnExpectedString() {
|
||||
String decodedStr = new String(UTF8_BYTES, StandardCharsets.UTF_8);
|
||||
assertEquals(UTF8_STRING, decodedStr);
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenConvertInvalidUTF8BytesToString_thenReturnReplacementCharacters() {
|
||||
String decodedStr = new String(INVALID_UTF8_BYTES, StandardCharsets.UTF_8);
|
||||
assertEquals("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", decodedStr);
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenDecodeValidUTF8Bytes_thenSucceeds() throws CharacterCodingException {
|
||||
|
||||
CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
|
||||
CharBuffer decodedCharBuffer = charsetDecoder.decode(java.nio.ByteBuffer.wrap(UTF8_BYTES));
|
||||
assertEquals(UTF8_STRING, decodedCharBuffer.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenDecodeInvalidUTF8Bytes_thenThrowsMalformedInputException() {
|
||||
|
||||
CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
|
||||
assertThrows(MalformedInputException.class,() -> {charsetDecoder.decode(java.nio.ByteBuffer.wrap(INVALID_UTF8_BYTES));});
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenValidateValidInputStreamByTika_thenReturnsUTF8() throws IOException {
|
||||
|
||||
EncodingDetector encodingDetector = new UniversalEncodingDetector();
|
||||
Charset detectedCharset = encodingDetector.detect(UTF8_INPUTSTREAM, new Metadata());
|
||||
assertEquals(StandardCharsets.UTF_8, detectedCharset);
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenValidateValidEnglishInputStreamByTika_thenReturnsISO_88591_1() throws IOException {
|
||||
|
||||
EncodingDetector encodingDetector = new UniversalEncodingDetector();
|
||||
Charset detectedCharset = encodingDetector.detect(ENGLISH_INPUTSTREAM, new Metadata());
|
||||
assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenValidateInvalidInputStreamByTika_thenReturnsNull() throws IOException {
|
||||
|
||||
EncodingDetector encodingDetector = new UniversalEncodingDetector();
|
||||
Charset detectedCharset = encodingDetector.detect(INVALID_UTF8_INPUTSTREAM, new Metadata());
|
||||
assertNull(detectedCharset);
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenValidateValidInputStreamByICU4J_thenReturnsUTF8() throws IOException {
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(UTF8_INPUTSTREAM);
|
||||
CharsetMatch charsetMatch = detector.detect();
|
||||
assertEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenValidateValidEnglishInputStreamByICU4J_thenReturnsISO_8859_1() throws IOException {
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(ENGLISH_INPUTSTREAM);
|
||||
CharsetMatch charsetMatch = detector.detect();
|
||||
assertEquals(StandardCharsets.ISO_8859_1.name(), charsetMatch.getName());
|
||||
}
|
||||
|
||||
@Test
|
||||
void whenValidateValidInputStreamByICU4J_thenReturnsNotEqualToUTF_8() throws IOException {
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(INVALID_UTF8_INPUTSTREAM);
|
||||
CharsetMatch charsetMatch = detector.detect();
|
||||
assertNotEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue