diff --git a/core-java-modules/core-java-string-operations-7/pom.xml b/core-java-modules/core-java-string-operations-7/pom.xml
index 33a74365bc..cea3e32f2f 100644
--- a/core-java-modules/core-java-string-operations-7/pom.xml
+++ b/core-java-modules/core-java-string-operations-7/pom.xml
@@ -24,6 +24,21 @@
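             <artifactId>commons-text</artifactId>
             <version>${commons-text.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${apache.tika.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers-standard-package</artifactId>
+            <version>${apache.tika.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>${icu4j.version}</version>
+        </dependency>
         <dependency>
             <groupId>org.junit.jupiter</groupId>
             <artifactId>junit-jupiter</artifactId>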
@@ -60,7 +75,9 @@
11
11
3.13.0
+        <apache.tika.version>2.9.1</apache.tika.version>
1.10.0
+        <icu4j.version>74.1</icu4j.version>
4.25.0
diff --git a/core-java-modules/core-java-string-operations-7/src/test/java/com/baeldung/utf8validation/UTF8ValidationUnitTest.java b/core-java-modules/core-java-string-operations-7/src/test/java/com/baeldung/utf8validation/UTF8ValidationUnitTest.java
new file mode 100644
index 0000000000..7e737bad1d
--- /dev/null
+++ b/core-java-modules/core-java-string-operations-7/src/test/java/com/baeldung/utf8validation/UTF8ValidationUnitTest.java
@@ -0,0 +1,108 @@
+package com.baeldung.utf8validation;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.txt.UniversalEncodingDetector;
+import org.junit.jupiter.api.Test;
+
+import java.io.*;
+import java.nio.CharBuffer;
+import java.nio.charset.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Demonstrates UTF-8 validation with the JDK, Apache Tika and ICU4J.
+ * Each detector test builds its own stream so no read position is shared between tests.
+ */
+class UTF8ValidationUnitTest {
+
+    private static final String UTF8_STRING = "Hello 你好";
+    private static final byte[] UTF8_BYTES = UTF8_STRING.getBytes(StandardCharsets.UTF_8);
+    private static final byte[] ENGLISH_BYTES = "Hello".getBytes(StandardCharsets.UTF_8);
+    private static final byte[] INVALID_UTF8_BYTES = {(byte) 0xF0, (byte) 0xC1, (byte) 0x8C, (byte) 0xBC, (byte) 0xD1};
+    // U+FFFD written as escapes, so the expectation does not depend on the source file encoding.
+    private static final String REPLACEMENT_CHARS = "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD";
+
+    /** Returns a new stream over {@code bytes}; streams are stateful and must not be shared. */
+    private static InputStream streamOf(byte[] bytes) {
+        return new ByteArrayInputStream(bytes);
+    }
+
+    @Test
+    void whenConvertValidUTF8BytesToString_thenReturnExpectedString() {
+        String decodedStr = new String(UTF8_BYTES, StandardCharsets.UTF_8);
+        assertEquals(UTF8_STRING, decodedStr);
+    }
+
+    @Test
+    void whenConvertInvalidUTF8BytesToString_thenReturnReplacementCharacters() {
+        // The String constructor does not throw on malformed input; it substitutes U+FFFD.
+        String decodedStr = new String(INVALID_UTF8_BYTES, StandardCharsets.UTF_8);
+        assertEquals(REPLACEMENT_CHARS, decodedStr);
+    }
+
+    @Test
+    void whenDecodeValidUTF8Bytes_thenSucceeds() throws CharacterCodingException {
+        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
+        CharBuffer decodedCharBuffer = charsetDecoder.decode(java.nio.ByteBuffer.wrap(UTF8_BYTES));
+        assertEquals(UTF8_STRING, decodedCharBuffer.toString());
+    }
+
+    @Test
+    void whenDecodeInvalidUTF8Bytes_thenThrowsMalformedInputException() {
+        // Unlike the String constructor, a freshly created decoder reports malformed input.
+        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
+        assertThrows(MalformedInputException.class, () -> charsetDecoder.decode(java.nio.ByteBuffer.wrap(INVALID_UTF8_BYTES)));
+    }
+
+    @Test
+    void whenValidateValidInputStreamByTika_thenReturnsUTF8() throws IOException {
+        EncodingDetector encodingDetector = new UniversalEncodingDetector();
+        Charset detectedCharset = encodingDetector.detect(streamOf(UTF8_BYTES), new Metadata());
+        assertEquals(StandardCharsets.UTF_8, detectedCharset);
+    }
+
+    @Test
+    void whenValidateValidEnglishInputStreamByTika_thenReturnsISO_8859_1() throws IOException {
+        // ASCII-only input is expected to be reported as ISO-8859-1 rather than UTF-8.
+        EncodingDetector encodingDetector = new UniversalEncodingDetector();
+        Charset detectedCharset = encodingDetector.detect(streamOf(ENGLISH_BYTES), new Metadata());
+        assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
+    }
+
+    @Test
+    void whenValidateInvalidInputStreamByTika_thenReturnsNull() throws IOException {
+        EncodingDetector encodingDetector = new UniversalEncodingDetector();
+        Charset detectedCharset = encodingDetector.detect(streamOf(INVALID_UTF8_BYTES), new Metadata());
+        assertNull(detectedCharset);
+    }
+
+    @Test
+    void whenValidateValidInputStreamByICU4J_thenReturnsUTF8() throws IOException {
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(streamOf(UTF8_BYTES));
+        CharsetMatch charsetMatch = detector.detect();
+        assertEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
+    }
+
+    @Test
+    void whenValidateValidEnglishInputStreamByICU4J_thenReturnsISO_8859_1() throws IOException {
+        // ASCII-only input is expected to be reported as ISO-8859-1 rather than UTF-8.
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(streamOf(ENGLISH_BYTES));
+        CharsetMatch charsetMatch = detector.detect();
+        assertEquals(StandardCharsets.ISO_8859_1.name(), charsetMatch.getName());
+    }
+
+    @Test
+    void whenValidateInvalidInputStreamByICU4J_thenReturnsNotEqualToUTF_8() throws IOException {
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(streamOf(INVALID_UTF8_BYTES));
+        CharsetMatch charsetMatch = detector.detect();
+        assertNotEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
+    }
+
+}
\ No newline at end of file