From 000bac6b9484209a2e0fda4586b2def38b6f9108 Mon Sep 17 00:00:00 2001 From: Henri Yandell Date: Sat, 14 Nov 2009 10:45:27 +0000 Subject: [PATCH] Changing the standard escapeXml method to not escape high characters. It's easier to add that back on than remove it. LANG-516 and LANG-517 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@836151 13f79535-47bb-0310-9956-ffa450edef68 --- .../lang/text/translate/EscapeUtils.java | 9 ++--- .../commons/lang/StringEscapeUtilsTest.java | 33 +++++++++++-------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/java/org/apache/commons/lang/text/translate/EscapeUtils.java b/src/java/org/apache/commons/lang/text/translate/EscapeUtils.java index 2237b1e89..579ace1b9 100644 --- a/src/java/org/apache/commons/lang/text/translate/EscapeUtils.java +++ b/src/java/org/apache/commons/lang/text/translate/EscapeUtils.java @@ -66,8 +66,7 @@ public class EscapeUtils { public static final CharSequenceTranslator ESCAPE_XML = new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE()), - new LookupTranslator(EntityArrays.APOS_ESCAPE()), - NumericEntityEscaper.above(0x7f) + new LookupTranslator(EntityArrays.APOS_ESCAPE()) ); public static final String escapeXml(String input) { @@ -77,8 +76,7 @@ public class EscapeUtils { public static final CharSequenceTranslator ESCAPE_HTML3 = new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE()), - new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()), - NumericEntityEscaper.above(0x7f) + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()) ); public static final String escapeHtml3(String input) { @@ -89,8 +87,7 @@ public class EscapeUtils { new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE()), new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()), - new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE()), - NumericEntityEscaper.above(0x7f) + new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE()) ); public 
static final String escapeHtml4(String input) { diff --git a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java index 4398c2355..6a3111641 100644 --- a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java +++ b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java @@ -218,9 +218,8 @@ public class StringEscapeUtilsTest extends TestCase { {"final character only", "greater than >", "greater than >"}, {"first character only", "< less than", "< less than"}, {"apostrophe", "Huntington's chorea", "Huntington's chorea"}, - {"languages", "English,Français,日本語 (nihongo)", "English,Fran\u00E7ais,\u65E5\u672C\u8A9E (nihongo)"}, - {"8-bit ascii doesn't number-escape", "~\u007F", "\u007E\u007F"}, - {"8-bit ascii does number-escape", "€Ÿ", "\u0080\u009F"}, + {"languages", "English,Français,\u65E5\u672C\u8A9E (nihongo)", "English,Fran\u00E7ais,\u65E5\u672C\u8A9E (nihongo)"}, + {"8-bit ascii shouldn't number-escape", "\u0080\u009F", "\u0080\u009F"}, }; public void testEscapeHtml() { @@ -298,9 +297,9 @@ public class StringEscapeUtilsTest extends TestCase { assertEquals("<abc>", StringEscapeUtils.escapeXml("")); assertEquals("", StringEscapeUtils.unescapeXml("<abc>")); - assertEquals("XML should use numbers, not names for HTML entities", - "¡", StringEscapeUtils.escapeXml("\u00A1")); - assertEquals("XML should use numbers, not names for HTML entities", + assertEquals("XML should not escape >0x7f values", + "\u00A1", StringEscapeUtils.escapeXml("\u00A1")); + assertEquals("XML should be able to unescape >0x7f values", "\u00A0", StringEscapeUtils.unescapeXml(" ")); assertEquals("ain't", StringEscapeUtils.unescapeXml("ain't")); @@ -413,21 +412,29 @@ public class StringEscapeUtilsTest extends TestCase { // codepoint: U+1D362 byte[] data = new byte[] { (byte)0xF0, (byte)0x9D, (byte)0x8D, (byte)0xA2 }; - String escaped = StringEscapeUtils.escapeHtml( new String(data, "UTF8") ); - String unescaped = 
StringEscapeUtils.unescapeHtml( escaped ); + String original = new String(data, "UTF8"); - assertEquals( "High unicode was not escaped correctly", "𝍢", escaped); + String escaped = StringEscapeUtils.escapeHtml( original ); + assertEquals( "High unicode should not have been escaped", original, escaped); + + String unescaped = StringEscapeUtils.unescapeHtml( escaped ); + assertEquals( "High unicode should have been unchanged", original, unescaped); + +// TODO: I think this should hold, needs further investigation +// String unescapedFromEntity = StringEscapeUtils.unescapeHtml( "𝍢" ); +// assertEquals( "High unicode should have been unescaped", original, unescapedFromEntity); } // https://issues.apache.org/jira/browse/LANG-339 public void testEscapeHiragana() throws java.io.UnsupportedEncodingException { // Some random Japanese unicode characters - String escaped = StringEscapeUtils.escapeHtml( "\u304B\u304C\u3068" ); - assertEquals( "Hiragana character unicode behaviour has changed from their being escaped", - "かがと", escaped); + String original = "\u304B\u304C\u3068"; + String escaped = StringEscapeUtils.escapeHtml(original); + assertEquals( "Hiragana character unicode behaviour should not be being escaped by escapeHtml", + original, escaped); String unescaped = StringEscapeUtils.unescapeHtml( escaped ); - assertEquals( "Hiragana character unicode behaviour has changed - expected no unescaping", escaped, escaped); + assertEquals( "Hiragana character unicode behaviour has changed - expected no unescaping", escaped, unescaped); } }