From c0683957931ffa3df218ff68ca31e257174a6466 Mon Sep 17 00:00:00 2001 From: Henri Yandell Date: Sun, 1 Mar 2009 20:54:40 +0000 Subject: [PATCH] Applying Alexander Kjall's patch from LANG-480; along with a unit test made from his example. Fixes unicode conversion above U+00FFFF being done into 2 characters git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@749095 13f79535-47bb-0310-9956-ffa450edef68 --- src/java/org/apache/commons/lang/Entities.java | 9 +++++++-- .../apache/commons/lang/StringEscapeUtilsTest.java | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/commons/lang/Entities.java b/src/java/org/apache/commons/lang/Entities.java index 900e1a896..0d8f7f4d9 100644 --- a/src/java/org/apache/commons/lang/Entities.java +++ b/src/java/org/apache/commons/lang/Entities.java @@ -825,10 +825,15 @@ public String escape(String str) { public void escape(Writer writer, String str) throws IOException { int len = str.length(); for (int i = 0; i < len; i++) { - char c = str.charAt(i); + int c = Character.codePointAt(str, i); String entityName = this.entityName(c); if (entityName == null) { - if (c > 0x7F) { + if (c >= 0x010000 && i < len - 1) { + writer.write("&#"); + writer.write(Integer.toString(c, 10)); + writer.write(';'); + i++; + } else if (c > 0x7F) { writer.write("&#"); writer.write(Integer.toString(c, 10)); writer.write(';'); diff --git a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java index e993ae680..18e6233f4 100644 --- a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java +++ b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java @@ -415,4 +415,18 @@ private void checkCsvUnescapeWriter(String expected, String value) { fail("Threw: " + e); } } + + // https://issues.apache.org/jira/browse/LANG-480 + public void testEscapeHtmlHighUnicode() throws java.io.UnsupportedEncodingException { + // this is the 
utf8 representation of the character: + // COUNTING ROD UNIT DIGIT THREE + // in unicode + // codepoint: U+1D362 + byte[] data = new byte[] { (byte)0xF0, (byte)0x9D, (byte)0x8D, (byte)0xA2 }; + + String escaped = StringEscapeUtils.escapeHtml( new String(data, "UTF8") ); + String unescaped = StringEscapeUtils.unescapeHtml( escaped ); + + assertEquals( "High unicode was not escaped correctly", "&#119650;", escaped); + } }