BAEL-7175: UTF-8 Validation in Java (#15428)

This commit is contained in:
Manfred 2023-12-22 02:07:48 +00:00 committed by GitHub
parent ca584307b2
commit 56a4afe866
2 changed files with 125 additions and 0 deletions

View File

@@ -24,6 +24,21 @@
<artifactId>commons-text</artifactId>
<version>${commons-text.version}</version>
</dependency>
<!-- Apache Tika core API; the UTF-8 validation test imports
     org.apache.tika.detect.EncodingDetector and org.apache.tika.metadata.Metadata. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${apache.tika.version}</version>
</dependency>
<!-- Tika standard parser bundle; presumably the source of
     org.apache.tika.parser.txt.UniversalEncodingDetector used by the test - confirm
     whether a narrower Tika module would be sufficient. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${apache.tika.version}</version>
</dependency>
<!-- ICU4J; the UTF-8 validation test imports com.ibm.icu.text.CharsetDetector
     and com.ibm.icu.text.CharsetMatch. -->
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>${icu4j.version}</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
@@ -60,7 +75,9 @@
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<apache.commons.lang3.version>3.13.0</apache.commons.lang3.version>
<apache.tika.version>2.9.1</apache.tika.version>
<commons-text.version>1.10.0</commons-text.version>
<icu4j.version>74.1</icu4j.version>
<liquibase.core.version>4.25.0</liquibase.core.version>
</properties>

View File

@@ -0,0 +1,108 @@
package com.baeldung.utf8validation;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
import org.junit.jupiter.api.Test;
import java.io.*;
import java.nio.CharBuffer;
import java.nio.charset.*;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Shows several ways to check whether a byte sequence is valid UTF-8:
 * the {@link String} constructor, a strict {@link CharsetDecoder},
 * Apache Tika's {@link UniversalEncodingDetector} and ICU4J's {@link CharsetDetector}.
 *
 * <p>Every test builds its own {@link InputStream} from an immutable byte array, so no
 * test depends on the read position left behind by another one.
 */
class UTF8ValidationUnitTest {

    private static final String UTF8_STRING = "Hello 你好";
    private static final byte[] UTF8_BYTES = UTF8_STRING.getBytes(StandardCharsets.UTF_8);

    /** Plain ASCII text; its bytes are identical in UTF-8 and ISO-8859-1. */
    private static final byte[] ENGLISH_BYTES = "Hello".getBytes(StandardCharsets.UTF_8);

    /** Byte sequence that is not well-formed UTF-8 (e.g. lead byte 0xF0 is not followed by continuation bytes). */
    private static final byte[] INVALID_UTF8_BYTES = {(byte) 0xF0, (byte) 0xC1, (byte) 0x8C, (byte) 0xBC, (byte) 0xD1};

    /**
     * Expected result of leniently decoding {@link #INVALID_UTF8_BYTES}: five U+FFFD
     * replacement characters. Written with unicode escapes so the literal does not
     * depend on the encoding of this source file.
     */
    private static final String REPLACEMENT_CHARACTERS = "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD";

    /** Wraps {@code bytes} in a fresh stream positioned at the first byte. */
    private static InputStream streamOf(byte[] bytes) {
        return new ByteArrayInputStream(bytes);
    }

    @Test
    void whenConvertValidUTF8BytesToString_thenReturnExpectedString() {
        String decodedStr = new String(UTF8_BYTES, StandardCharsets.UTF_8);

        assertEquals(UTF8_STRING, decodedStr);
    }

    /** The {@code String} constructor never fails on malformed input; it substitutes U+FFFD instead. */
    @Test
    void whenConvertInvalidUTF8BytesToString_thenReturnReplacementCharacters() {
        String decodedStr = new String(INVALID_UTF8_BYTES, StandardCharsets.UTF_8);

        assertEquals(REPLACEMENT_CHARACTERS, decodedStr);
    }

    @Test
    void whenDecodeValidUTF8Bytes_thenSucceeds() throws CharacterCodingException {
        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();

        CharBuffer decodedCharBuffer = charsetDecoder.decode(java.nio.ByteBuffer.wrap(UTF8_BYTES));

        assertEquals(UTF8_STRING, decodedCharBuffer.toString());
    }

    /** A freshly created decoder reports malformed input, so decoding the invalid bytes throws. */
    @Test
    void whenDecodeInvalidUTF8Bytes_thenThrowsMalformedInputException() {
        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();

        assertThrows(MalformedInputException.class,
            () -> charsetDecoder.decode(java.nio.ByteBuffer.wrap(INVALID_UTF8_BYTES)));
    }

    @Test
    void whenValidateValidInputStreamByTika_thenReturnsUTF8() throws IOException {
        EncodingDetector encodingDetector = new UniversalEncodingDetector();

        Charset detectedCharset = encodingDetector.detect(streamOf(UTF8_BYTES), new Metadata());

        assertEquals(StandardCharsets.UTF_8, detectedCharset);
    }

    /**
     * Pure ASCII input is reported as ISO-8859-1 rather than UTF-8.
     * NOTE(review): the method name says "ISO_88591_1"; "ISO_8859_1" was presumably intended.
     * The name is kept so existing test filters keep matching.
     */
    @Test
    void whenValidateValidEnglishInputStreamByTika_thenReturnsISO_88591_1() throws IOException {
        EncodingDetector encodingDetector = new UniversalEncodingDetector();

        Charset detectedCharset = encodingDetector.detect(streamOf(ENGLISH_BYTES), new Metadata());

        assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
    }

    @Test
    void whenValidateInvalidInputStreamByTika_thenReturnsNull() throws IOException {
        EncodingDetector encodingDetector = new UniversalEncodingDetector();

        Charset detectedCharset = encodingDetector.detect(streamOf(INVALID_UTF8_BYTES), new Metadata());

        assertNull(detectedCharset);
    }

    @Test
    void whenValidateValidInputStreamByICU4J_thenReturnsUTF8() throws IOException {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(streamOf(UTF8_BYTES));

        CharsetMatch charsetMatch = detector.detect();

        assertEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
    }

    @Test
    void whenValidateValidEnglishInputStreamByICU4J_thenReturnsISO_8859_1() throws IOException {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(streamOf(ENGLISH_BYTES));

        CharsetMatch charsetMatch = detector.detect();

        assertEquals(StandardCharsets.ISO_8859_1.name(), charsetMatch.getName());
    }

    /**
     * Feeds the <em>invalid</em> byte sequence to ICU4J and checks that the best match is not UTF-8.
     * NOTE(review): the method name says "ValidInputStream" although the input is the invalid one.
     * The name is kept so existing test filters keep matching.
     */
    @Test
    void whenValidateValidInputStreamByICU4J_thenReturnsNotEqualToUTF_8() throws IOException {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(streamOf(INVALID_UTF8_BYTES));

        CharsetMatch charsetMatch = detector.detect();

        assertNotEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
    }
}