From 8061821ca2d83dcc98358ff0239289ec2f3f42f4 Mon Sep 17 00:00:00 2001 From: Doug Cutting Date: Wed, 10 Feb 2010 21:58:11 +0000 Subject: [PATCH] HADOOP-6522. Fix decoding of codepoint zero in UTF8. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@908661 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 ++ src/java/org/apache/hadoop/io/UTF8.java | 4 ++-- src/test/core/org/apache/hadoop/io/TestUTF8.java | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 207b49262c4..4611aad3093 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -184,6 +184,8 @@ Trunk (unreleased changes) HADOOP-6540. Contrib unit tests have invalid XML for core-site, etc. (Aaron Kimball via tomwhite) + HADOOP-6522. Fix decoding of codepoint zero in UTF8. (cutting) + Release 0.21.0 - Unreleased INCOMPATIBLE CHANGES diff --git a/src/java/org/apache/hadoop/io/UTF8.java b/src/java/org/apache/hadoop/io/UTF8.java index d9f45f7e6b4..4b3e2379b7a 100644 --- a/src/java/org/apache/hadoop/io/UTF8.java +++ b/src/java/org/apache/hadoop/io/UTF8.java @@ -253,7 +253,7 @@ public class UTF8 implements WritableComparable { int utf8Length = 0; for (int i = 0; i < stringLength; i++) { int c = string.charAt(i); - if ((c >= 0x0001) && (c <= 0x007F)) { + if (c <= 0x007F) { utf8Length++; } else if (c > 0x07FF) { utf8Length += 3; @@ -270,7 +270,7 @@ public class UTF8 implements WritableComparable { final int end = start + length; for (int i = start; i < end; i++) { int code = s.charAt(i); - if (code >= 0x01 && code <= 0x7F) { + if (code <= 0x7F) { out.writeByte((byte)code); } else if (code <= 0x07FF) { out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F))); diff --git a/src/test/core/org/apache/hadoop/io/TestUTF8.java b/src/test/core/org/apache/hadoop/io/TestUTF8.java index 115fbe96c41..47ad5bd1383 100644 --- a/src/test/core/org/apache/hadoop/io/TestUTF8.java +++ b/src/test/core/org/apache/hadoop/io/TestUTF8.java @@ -22,6 +22,7 @@ import junit.framework.TestCase; import java.util.Random; /** Unit tests for UTF8. */ +@SuppressWarnings("deprecation") public class TestUTF8 extends TestCase { public TestUTF8(String name) { super(name); } @@ -37,13 +38,13 @@ public class TestUTF8 extends TestCase { } public void testWritable() throws Exception { - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 10000; i++) { TestWritable.testWritable(new UTF8(getTestString())); } } public void testGetBytes() throws Exception { - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 10000; i++) { // generate a random string String before = getTestString(); @@ -57,7 +58,7 @@ public class TestUTF8 extends TestCase { DataOutputBuffer out = new DataOutputBuffer(); DataInputBuffer in = new DataInputBuffer(); - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 10000; i++) { // generate a random string String before = getTestString(); @@ -82,5 +83,14 @@ public class TestUTF8 extends TestCase { } } + + public void testNullEncoding() throws Exception { + String s = new String(new char[] { 0 }); + + DataOutputBuffer dob = new DataOutputBuffer(); + new UTF8(s).write(dob); + + assertEquals(s, new String(dob.getData(), 2, dob.getLength()-2, "UTF-8")); + } }