LANG-955: Add methods for removing all invalid characters according to XML 1.0 and XML 1.1 in an input string to StringEscapeUtils. Thanks to Adam Hooper

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1568639 13f79535-47bb-0310-9956-ffa450edef68
2025-02-11 12:35:07 +00:00 · 2014-02-15 16:13:27 +00:00 · 2014-02-15 16:13:27 +00:00 · 74fa00c3c5
commit 74fa00c3c5
parent ab14240150
5 changed files with 279 additions and 0 deletions
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@ -22,6 +22,7 @@
  <body>

  <release version="3.3" date="TBA" description="Bugfix and Feature release">
+    <action issue="LANG-955" type="add" dev="britter" due-to="Adam Hooper">Add methods for removing all invalid characters according to XML 1.0 and XML 1.1 in an input string to StringEscapeUtils</action>
    <action issue="LANG-977" type="fix" dev="britter" due-to="Chris Karcher">NumericEntityEscaper incorrectly encodes supplementary characters</action>
    <action issue="LANG-973" type="fix" dev="sebb">Make some private fields final</action>
    <action issue="LANG-971" type="fix" dev="sebb">NumberUtils#isNumber(String) fails to reject invalid Octal numbers</action>
--- a/src/main/java/org/apache/commons/lang3/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/lang3/StringEscapeUtils.java
@ -24,9 +24,11 @@
 import org.apache.commons.lang3.text.translate.EntityArrays;
 import org.apache.commons.lang3.text.translate.JavaUnicodeEscaper;
 import org.apache.commons.lang3.text.translate.LookupTranslator;
+import org.apache.commons.lang3.text.translate.NumericEntityEscaper;
 import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
 import org.apache.commons.lang3.text.translate.OctalUnescaper;
 import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
+import org.apache.commons.lang3.text.translate.UnicodeUnpairedSurrogateRemover;

 /**
 * <p>Escapes and unescapes {@code String}s for
@ -111,12 +113,94 @@ public class StringEscapeUtils {
     * as the foundation for a custom translator. 
     *
     * @since 3.0
+     * @deprecated use {@link #ESCAPE_XML10} or {@link #ESCAPE_XML11} instead.
     */
+    @Deprecated
    public static final CharSequenceTranslator ESCAPE_XML = 
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.APOS_ESCAPE())
        );
+    
+    /**
+     * Translator object for escaping XML 1.0. It escapes the markup characters,
+     * deletes the characters mapped to the empty string below, writes
+     * [#x7F-#x84] and [#x86-#x9F] as numeric entities and drops surrogate codepoints.
+     * While {@link #escapeXml10(String)} is the expected method of use, this object allows
+     * the XML escaping functionality to be used as the foundation for a custom translator.
+     *
+     * @since 3.3
+     */
+    public static final CharSequenceTranslator ESCAPE_XML10 =
+        new AggregateTranslator(
+            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+            new LookupTranslator(EntityArrays.APOS_ESCAPE()),
+            new LookupTranslator(
+                    new String[][] {
+                            { "\u0000", "" }, // an empty replacement deletes the character
+                            { "\u0001", "" },
+                            { "\u0002", "" },
+                            { "\u0003", "" },
+                            { "\u0004", "" },
+                            { "\u0005", "" },
+                            { "\u0006", "" },
+                            { "\u0007", "" },
+                            { "\u0008", "" },
+                            { "\u000b", "" }, // #x9 and #xA are absent, so tab and line feed are kept
+                            { "\u000c", "" },
+                            { "\u000e", "" }, // #xD is absent, so carriage return is kept
+                            { "\u000f", "" },
+                            { "\u0010", "" },
+                            { "\u0011", "" },
+                            { "\u0012", "" },
+                            { "\u0013", "" },
+                            { "\u0014", "" },
+                            { "\u0015", "" },
+                            { "\u0016", "" },
+                            { "\u0017", "" },
+                            { "\u0018", "" },
+                            { "\u0019", "" },
+                            { "\u001a", "" },
+                            { "\u001b", "" },
+                            { "\u001c", "" },
+                            { "\u001d", "" },
+                            { "\u001e", "" },
+                            { "\u001f", "" },
+                            { "\ufffe", "" }, // #xFFFE and #xFFFF lie outside the allowed [#xE000-#xFFFD]
+                            { "\uffff", "" }
+                    }),
+            NumericEntityEscaper.between(0x7f, 0x84), // e.g. #x7F is written as &#127;
+            NumericEntityEscaper.between(0x86, 0x9f), // #x85 sits between the two ranges and is not escaped
+            new UnicodeUnpairedSurrogateRemover() // drops codepoints in the surrogate range
+        );
+    
+    /**
+     * Translator object for escaping XML 1.1. Besides the markup characters it
+     * deletes #x0, #xFFFE, #xFFFF and surrogate codepoints, and writes
+     * [#x1-#x8], [#xB-#xC], [#xE-#x1F], [#x7F-#x84] and [#x86-#x9F] as numeric entities.
+     * While {@link #escapeXml11(String)} is the expected method of use, this object allows
+     * the XML escaping functionality to be used as the foundation for a custom translator.
+     *
+     * @since 3.3
+     */
+    public static final CharSequenceTranslator ESCAPE_XML11 =
+        new AggregateTranslator(
+            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+            new LookupTranslator(EntityArrays.APOS_ESCAPE()),
+            new LookupTranslator(
+                    new String[][] {
+                            { "\u0000", "" }, // an empty replacement deletes the character
+                            { "\u000b", "&#11;" }, // #xB and #xC are replaced by fixed decimal entities
+                            { "\u000c", "&#12;" },
+                            { "\ufffe", "" }, // #xFFFE and #xFFFF lie outside the allowed [#xE000-#xFFFD]
+                            { "\uffff", "" }
+                    }),
+            NumericEntityEscaper.between(0x1, 0x8), // #x9, #xA and #xD fall in the gaps and stay literal
+            NumericEntityEscaper.between(0xe, 0x1f),
+            NumericEntityEscaper.between(0x7f, 0x84),
+            NumericEntityEscaper.between(0x86, 0x9f), // #x85 sits between the two ranges and is not escaped
+            new UnicodeUnpairedSurrogateRemover() // drops codepoints in the surrogate range
+        );

    /**
     * Translator object for escaping HTML version 3.0.
@ -579,11 +663,76 @@ public static final String unescapeHtml3(final String input) {
     * @param input  the {@code String} to escape, may be null
     * @return a new escaped {@code String}, {@code null} if null string input
     * @see #unescapeXml(java.lang.String)
+     * @deprecated use {@link #escapeXml10(java.lang.String)} or {@link #escapeXml11(java.lang.String)} instead.
     */
+    @Deprecated
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
    public static final String escapeXml(final String input) {
        return ESCAPE_XML.translate(input);
    }

+    /**
+     * <p>Escapes the characters in a {@code String} using XML entities, by
+     * applying the {@link #ESCAPE_XML10} translator.</p>
+     *
+     * <p>For example: {@code "bread" & "butter"} =&gt;
+     * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
+     *
+     * <p>Note that XML 1.0 is a text-only format: it cannot represent control
+     * characters or unpaired Unicode surrogate codepoints, even after escaping.
+     * {@code escapeXml10} will remove characters that do not fit in the
+     * following ranges:</p>
+     * 
+     * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
+     * 
+     * <p>Though not strictly necessary, {@code escapeXml10} will escape
+     * characters in the following ranges:</p>
+     * 
+     * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p>
+     * 
+     * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1
+     * document. If you want to allow more non-text characters in an XML 1.1
+     * document, use {@link #escapeXml11(String)}.</p>
+     *
+     * @param input  the {@code String} to escape, may be null
+     * @return a new escaped {@code String}, {@code null} if null string input
+     * @see #unescapeXml(java.lang.String)
+     * @since 3.3
+     */
+    public static String escapeXml10(final String input) {
+        return ESCAPE_XML10.translate(input);
+    }
+    
+    /**
+     * <p>Escapes the characters in a {@code String} using XML entities, by
+     * applying the {@link #ESCAPE_XML11} translator.</p>
+     *
+     * <p>For example: {@code "bread" & "butter"} =&gt;
+     * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.</p>
+     *
+     * <p>XML 1.1 can represent certain control characters, but it cannot represent
+     * the null byte or unpaired Unicode surrogate codepoints, even after escaping.
+     * {@code escapeXml11} will remove characters that do not fit in the following
+     * ranges:</p>
+     * 
+     * <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
+     * 
+     * <p>{@code escapeXml11} will escape characters in the following ranges:</p>
+     * 
+     * <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p>
+     * 
+     * <p>The returned string can be inserted into a valid XML 1.1 document. Do not
+     * use it for XML 1.0 documents.</p>
+     *
+     * @param input  the {@code String} to escape, may be null
+     * @return a new escaped {@code String}, {@code null} if null string input
+     * @see #unescapeXml(java.lang.String)
+     * @since 3.3
+     */
+    public static String escapeXml11(final String input) {
+        return ESCAPE_XML11.translate(input);
+    }
+
    //-----------------------------------------------------------------------
    /**
     * <p>Unescapes a string containing XML entity escapes to a string
@ -599,6 +748,8 @@ public static final String escapeXml(final String input) {
     * @param input  the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     * @see #escapeXml(String)
+     * @see #escapeXml10(String)
+     * @see #escapeXml11(String)
     */
    public static final String unescapeXml(final String input) {
        return UNESCAPE_XML.translate(input);
--- a/src/main/java/org/apache/commons/lang3/text/translate/UnicodeUnpairedSurrogateRemover.java
+++ b/src/main/java/org/apache/commons/lang3/text/translate/UnicodeUnpairedSurrogateRemover.java
@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.lang3.text.translate;
+
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * Translator that strips unpaired UTF-16 surrogates out of the translated text.
+ * 
+ * @version $Id$
+ */
+public class UnicodeUnpairedSurrogateRemover extends CodePointTranslator {
+    /**
+     * Swallows any codepoint that lies in the surrogate range: nothing is written
+     * to {@code out}, yet the codepoint is reported as translated.
+     * NOTE(review): presumably the base class passes whole code points, so only a lone surrogate lands here - confirm.
+     * {@inheritDoc}
+     */
+    @Override
+    public boolean translate(int codepoint, Writer out) throws IOException {
+        // A surrogate is consumed without any output (true); every other
+        // codepoint is reported as not translated (false).
+        final boolean isSurrogate = codepoint >= Character.MIN_SURROGATE
+                && codepoint <= Character.MAX_SURROGATE;
+        return isSurrogate;
+    }
+}
+
--- a/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
+++ b/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
@ -291,6 +291,7 @@ public void testEscapeHtmlVersions() throws Exception {
    }

    @Test
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
    public void testEscapeXml() throws Exception {
        assertEquals("&lt;abc&gt;", StringEscapeUtils.escapeXml("<abc>"));
        assertEquals("<abc>", StringEscapeUtils.unescapeXml("&lt;abc&gt;"));
@ -326,6 +327,38 @@ public void testEscapeXml() throws Exception {
        }
        assertEquals("XML was unescaped incorrectly", "<abc>", sw.toString() );
    }
+    
+    @Test // covers entity escaping, removal of characters XML 1.0 cannot hold, and numeric escaping
+    public void testEscapeXml10() throws Exception {
+        assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml10("a<b>c\"d'e&f"));
+        assertEquals("XML 1.0 should not escape \\t \\n \\r", // escapes shown literally so the message stays readable
+                "a\tb\rc\nd", StringEscapeUtils.escapeXml10("a\tb\rc\nd"));
+        assertEquals("XML 1.0 should omit #x0-#x8 | #xb | #xc | #xe-#x1f", // input samples both ends of each run
+                "ab", StringEscapeUtils.escapeXml10("a\u0000\u0001\u0008\u000b\u000c\u000e\u001fb"));
+        assertEquals("XML 1.0 should omit #xd800-#xdfff", // the two spaces survive; only the lone surrogates vanish
+                "a\ud7ff  \ue000b", StringEscapeUtils.escapeXml10("a\ud7ff\ud800 \udfff \ue000b"));
+        assertEquals("XML 1.0 should omit #xfffe | #xffff",
+                "a\ufffdb", StringEscapeUtils.escapeXml10("a\ufffd\ufffe\uffffb"));
+        assertEquals("XML 1.0 should escape #x7f-#x84 | #x86 - #x9f, for XML 1.1 compatibility",
+                "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml10("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
+    }
+    
+    @Test // covers entity escaping, removal of #x0 / surrogates / #xfffe-#xffff, and numeric escaping
+    public void testEscapeXml11() throws Exception {
+        assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml11("a<b>c\"d'e&f"));
+        assertEquals("XML 1.1 should not escape \\t \\n \\r", // escapes shown literally so the message stays readable
+                "a\tb\rc\nd", StringEscapeUtils.escapeXml11("a\tb\rc\nd"));
+        assertEquals("XML 1.1 should omit #x0",
+                "ab", StringEscapeUtils.escapeXml11("a\u0000b"));
+        assertEquals("XML 1.1 should escape #x1-#x8 | #xb | #xc | #xe-#x1f", // input samples both ends of each run
+                "a&#1;&#8;&#11;&#12;&#14;&#31;b", StringEscapeUtils.escapeXml11("a\u0001\u0008\u000b\u000c\u000e\u001fb"));
+        assertEquals("XML 1.1 should escape #x7F-#x84 | #x86-#x9F", // #x7e, #x85 and #xa0 must stay literal
+                "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml11("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
+        assertEquals("XML 1.1 should omit #xd800-#xdfff", // the two spaces survive; only the lone surrogates vanish
+                "a\ud7ff  \ue000b", StringEscapeUtils.escapeXml11("a\ud7ff\ud800 \udfff \ue000b"));
+        assertEquals("XML 1.1 should omit #xfffe | #xffff",
+                "a\ufffdb", StringEscapeUtils.escapeXml11("a\ufffd\ufffe\uffffb"));
+    }

    /**
     * Tests Supplementary characters. 
@ -342,6 +375,7 @@ public void testEscapeXml() throws Exception {
     * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
     */
    @Test
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
    public void testEscapeXmlSupplementaryCharacters() {
        final CharSequenceTranslator escapeXml = 
            StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );
@ -354,6 +388,7 @@ public void testEscapeXmlSupplementaryCharacters() {
    }
    
    @Test
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
    public void testEscapeXmlAllCharacters() {
        // http://www.w3.org/TR/xml/#charsets says:
        // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character,
@ -534,6 +569,7 @@ public void testLang708() throws IOException {
     * Tests https://issues.apache.org/jira/browse/LANG-720
     */
    @Test
+    @SuppressWarnings( "deprecation" ) // escapeXml(String) has been replaced by escapeXml10(String) and escapeXml11(String) in 3.3
    public void testLang720() {
        final String input = new StringBuilder("\ud842\udfb7").append("A").toString();
        final String escaped = StringEscapeUtils.escapeXml(input);
--- a/src/test/java/org/apache/commons/lang3/text/translate/UnicodeUnpairedSurrogateRemoverTest.java
+++ b/src/test/java/org/apache/commons/lang3/text/translate/UnicodeUnpairedSurrogateRemoverTest.java
@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.lang3.text.translate;
+
+import static org.junit.Assert.*;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import org.junit.Test;
+
+/**
+ * Checks the translate result of {@link UnicodeUnpairedSurrogateRemover} at the
+ * edges of the surrogate range.
+ * @version $Id$
+ */
+public class UnicodeUnpairedSurrogateRemoverTest {
+    final UnicodeUnpairedSurrogateRemover subject = new UnicodeUnpairedSurrogateRemover();
+    final CharArrayWriter writer = new CharArrayWriter(); // expected to stay empty in every test
+    
+    @Test
+    public void testValidCharacters() throws IOException {
+        assertFalse(subject.translate(0xd7ff, writer)); // last codepoint below the surrogate range
+        assertFalse(subject.translate(0xe000, writer)); // first codepoint above the surrogate range
+        assertEquals(0, writer.size());
+    }
+    
+    @Test
+    public void testInvalidCharacters() throws IOException {
+        assertTrue(subject.translate(0xd800, writer)); // lowest surrogate value
+        assertTrue(subject.translate(0xdfff, writer)); // highest surrogate value
+        assertEquals(0, writer.size());
+    }
+}
+