diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java index 8a14860773c..2d42a939977 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java @@ -1858,10 +1858,10 @@ public class SequenceFile { UTF8 className = new UTF8(); className.readFields(in); - keyClassName = className.toString(); // key class name + keyClassName = className.toStringChecked(); // key class name className.readFields(in); - valClassName = className.toString(); // val class name + valClassName = className.toStringChecked(); // val class name } else { keyClassName = Text.readString(in); valClassName = Text.readString(in); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/UTF8.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/UTF8.java index 4124949a4fc..89f1e428bb3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/UTF8.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/UTF8.java @@ -21,6 +21,7 @@ package org.apache.hadoop.io; import java.io.IOException; import java.io.DataInput; import java.io.DataOutput; +import java.io.UTFDataFormatException; import org.apache.hadoop.util.StringUtils; @@ -155,6 +156,21 @@ public class UTF8 implements WritableComparable { } return buffer.toString(); } + + /** + * Convert to a string, checking for valid UTF8. + * @return the converted string + * @throws UTFDataFormatException if the underlying bytes contain invalid + * UTF8 data. 
+ */ + public String toStringChecked() throws IOException { + StringBuilder buffer = new StringBuilder(length); + synchronized (IBUF) { + IBUF.reset(bytes, length); + readChars(IBUF, buffer, length); + } + return buffer.toString(); + } /** Returns true iff o is a UTF8 with the same contents. */ @Override @@ -238,7 +254,7 @@ public class UTF8 implements WritableComparable { } private static void readChars(DataInput in, StringBuilder buffer, int nBytes) - throws IOException { + throws UTFDataFormatException, IOException { DataOutputBuffer obuf = OBUF_FACTORY.get(); obuf.reset(); obuf.write(in, nBytes); @@ -250,15 +266,27 @@ public class UTF8 implements WritableComparable { // 0b0xxxxxxx: 1-byte sequence buffer.append((char)(b & 0x7F)); } else if ((b & 0xE0) == 0xC0) { + if (i >= nBytes) { + throw new UTFDataFormatException("Truncated UTF8 at " + + StringUtils.byteToHexString(bytes, i - 1, nBytes)); + } // 0b110xxxxx: 2-byte sequence buffer.append((char)(((b & 0x1F) << 6) | (bytes[i++] & 0x3F))); } else if ((b & 0xF0) == 0xE0) { // 0b1110xxxx: 3-byte sequence + if (i + 1 >= nBytes) { + throw new UTFDataFormatException("Truncated UTF8 at " + + StringUtils.byteToHexString(bytes, i - 1, nBytes)); + } buffer.append((char)(((b & 0x0F) << 12) | ((bytes[i++] & 0x3F) << 6) | (bytes[i++] & 0x3F))); } else if ((b & 0xF8) == 0xF0) { + if (i + 2 >= nBytes) { + throw new UTFDataFormatException("Truncated UTF8 at " + + StringUtils.byteToHexString(bytes, i - 1, nBytes)); + } // 0b11110xxx: 4-byte sequence int codepoint = ((b & 0x07) << 18) @@ -274,8 +302,8 @@ public class UTF8 implements WritableComparable { // Only show the next 6 bytes max in the error code - in case the // buffer is large, this will prevent an exceedingly large message. 
int endForError = Math.min(i + 5, nBytes); - throw new IOException("Invalid UTF8 at " + - StringUtils.byteToHexString(bytes, i - 1, endForError)); + throw new UTFDataFormatException("Invalid UTF8 at " + + StringUtils.byteToHexString(bytes, i - 1, endForError)); } } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestUTF8.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestUTF8.java index 902f215d06b..b3872248327 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestUTF8.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestUTF8.java @@ -20,6 +20,7 @@ package org.apache.hadoop.io; import junit.framework.TestCase; import java.io.IOException; +import java.io.UTFDataFormatException; import java.util.Random; import org.apache.hadoop.test.GenericTestUtils; @@ -126,9 +127,9 @@ public class TestUTF8 extends TestCase { try { UTF8.fromBytes(invalid); fail("did not throw an exception"); - } catch (IOException ioe) { + } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( - "Invalid UTF8 at ffff01020304", ioe); + "Invalid UTF8 at ffff01020304", utfde); } } @@ -142,9 +143,27 @@ public class TestUTF8 extends TestCase { try { UTF8.fromBytes(invalid); fail("did not throw an exception"); - } catch (IOException ioe) { + } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( - "Invalid UTF8 at f88880808004", ioe); + "Invalid UTF8 at f88880808004", utfde); + } + } + + /** + * Test that decoding invalid UTF8 due to truncation yields the correct + * exception type. + */ + public void testInvalidUTF8Truncated() throws Exception { + // Truncated CAT FACE character -- this is a 4-byte sequence, but we + // only have the first three bytes. 
+ byte[] truncated = new byte[] { + (byte)0xF0, (byte)0x9F, (byte)0x90 }; + try { + UTF8.fromBytes(truncated); + fail("did not throw an exception"); + } catch (UTFDataFormatException utfde) { + GenericTestUtils.assertExceptionContains( + "Truncated UTF8 at f09f90", utfde); } } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 0c346c7b288..bb3099d9558 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -569,6 +569,9 @@ Release 2.0.3-alpha - Unreleased HDFS-4238. Standby namenode should not do purging of shared storage edits. (todd) + HDFS-4282. TestEditLog.testFuzzSequences FAILED in all pre-commit test + (todd) + BREAKDOWN OF HDFS-3077 SUBTASKS HDFS-3077. Quorum-based protocol for reading and writing edit logs. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java index 7eda24948af..5649833db7a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java @@ -197,7 +197,7 @@ public class FSImageSerialization { public static String readString(DataInputStream in) throws IOException { DeprecatedUTF8 ustr = TL_DATA.get().U_STR; ustr.readFields(in); - return ustr.toString(); + return ustr.toStringChecked(); } static String readString_EmptyAsNull(DataInputStream in) throws IOException {