LANG-955: Add methods for removing all invalid characters according to XML 1.0 and XML 1.1 in an input string to StringEscapeUtils. Thanks to Adam Hooper

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1568639 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Benedikt Ritter 2014-02-15 16:13:27 +00:00
parent ab14240150
commit 74fa00c3c5
5 changed files with 279 additions and 0 deletions

View File

@ -22,6 +22,7 @@
<body>
<release version="3.3" date="TBA" description="Bugfix and Feature release">
<action issue="LANG-955" type="add" dev="britter" due-to="Adam Hooper">Add methods for removing all invalid characters according to XML 1.0 and XML 1.1 in an input string to StringEscapeUtils</action>
<action issue="LANG-977" type="fix" dev="britter" due-to="Chris Karcher">NumericEntityEscaper incorrectly encodes supplementary characters</action>
<action issue="LANG-973" type="fix" dev="sebb">Make some private fields final</action>
<action issue="LANG-971" type="fix" dev="sebb">NumberUtils#isNumber(String) fails to reject invalid Octal numbers</action>

View File

@ -24,9 +24,11 @@
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.JavaUnicodeEscaper;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.commons.lang3.text.translate.NumericEntityEscaper;
import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
import org.apache.commons.lang3.text.translate.OctalUnescaper;
import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
import org.apache.commons.lang3.text.translate.UnicodeUnpairedSurrogateRemover;
/**
* <p>Escapes and unescapes {@code String}s for
@ -111,12 +113,94 @@ public class StringEscapeUtils {
* as the foundation for a custom translator.
*
* @since 3.0
* @deprecated use {@link #ESCAPE_XML10} or {@link #ESCAPE_XML11} instead.
*/
@Deprecated
public static final CharSequenceTranslator ESCAPE_XML =
new AggregateTranslator(
// Only these two entity lookup tables are applied: unlike ESCAPE_XML10 and ESCAPE_XML11,
// this translator has no step that removes or numerically escapes control characters
// or unpaired surrogates.
new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
new LookupTranslator(EntityArrays.APOS_ESCAPE())
);
/**
* Translator object for escaping XML 1.0.
*
* While {@link #escapeXml10(String)} is the expected method of use, this
* object allows the XML escaping functionality to be used
* as the foundation for a custom translator.
*
* <p>It combines the basic and apostrophe entity lookups, a lookup table that
* maps characters which XML 1.0 cannot represent to the empty string, numeric
* entity escaping for {@code [#x7F-#x84] | [#x86-#x9F]}, and a
* {@link UnicodeUnpairedSurrogateRemover}.</p>
*
* @since 3.3
*/
public static final CharSequenceTranslator ESCAPE_XML10 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
new LookupTranslator(EntityArrays.APOS_ESCAPE()),
// Removal table: every key below is replaced by the empty string.
// #x9, #xA and #xD are deliberately missing because escapeXml10 documents them
// as valid XML 1.0 characters that must pass through unchanged.
new LookupTranslator(
new String[][] {
{ "\u0000", "" },
{ "\u0001", "" },
{ "\u0002", "" },
{ "\u0003", "" },
{ "\u0004", "" },
{ "\u0005", "" },
{ "\u0006", "" },
{ "\u0007", "" },
{ "\u0008", "" },
{ "\u000b", "" },
{ "\u000c", "" },
{ "\u000e", "" },
{ "\u000f", "" },
{ "\u0010", "" },
{ "\u0011", "" },
{ "\u0012", "" },
{ "\u0013", "" },
{ "\u0014", "" },
{ "\u0015", "" },
{ "\u0016", "" },
{ "\u0017", "" },
{ "\u0018", "" },
{ "\u0019", "" },
{ "\u001a", "" },
{ "\u001b", "" },
{ "\u001c", "" },
{ "\u001d", "" },
{ "\u001e", "" },
{ "\u001f", "" },
// #xFFFE and #xFFFF lie just past the documented valid range [#xE000-#xFFFD].
{ "\ufffe", "" },
{ "\uffff", "" }
}),
// Escaped rather than removed; #x85 sits between the two ranges and is left as-is,
// matching the documented escape set [#x7F-#x84] | [#x86-#x9F].
NumericEntityEscaper.between(0x7f, 0x84),
NumericEntityEscaper.between(0x86, 0x9f),
// Consumes code points in the surrogate range without writing anything.
new UnicodeUnpairedSurrogateRemover()
);
/**
* Translator object for escaping XML 1.1.
*
* While {@link #escapeXml11(String)} is the expected method of use, this
* object allows the XML escaping functionality to be used
* as the foundation for a custom translator.
*
* <p>It combines the basic and apostrophe entity lookups, a small lookup table
* that removes {@code #x0}, {@code #xFFFE} and {@code #xFFFF} and writes
* {@code #xB} and {@code #xC} as numeric entities, numeric entity escaping for
* {@code [#x1-#x8] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}, and a
* {@link UnicodeUnpairedSurrogateRemover}.</p>
*
* @since 3.3
*/
public static final CharSequenceTranslator ESCAPE_XML11 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
new LookupTranslator(EntityArrays.APOS_ESCAPE()),
// Only #x0, #xFFFE and #xFFFF are dropped (mapped to the empty string);
// #xB and #xC are rewritten as decimal numeric entities instead.
new LookupTranslator(
new String[][] {
{ "\u0000", "" },
{ "\u000b", "&#11;" },
{ "\u000c", "&#12;" },
{ "\ufffe", "" },
{ "\uffff", "" }
}),
// Control characters that are escaped rather than removed. #x9, #xA and #xD fall
// between the first two ranges and #x85 between the last two, so none of them is
// covered here and they are left unchanged.
NumericEntityEscaper.between(0x1, 0x8),
NumericEntityEscaper.between(0xe, 0x1f),
NumericEntityEscaper.between(0x7f, 0x84),
NumericEntityEscaper.between(0x86, 0x9f),
// Consumes code points in the surrogate range without writing anything.
new UnicodeUnpairedSurrogateRemover()
);
/**
* Translator object for escaping HTML version 3.0.
@ -579,11 +663,76 @@ public static final String unescapeHtml3(final String input) {
* @param input the {@code String} to escape, may be null
* @return a new escaped {@code String}, {@code null} if null string input
* @see #unescapeXml(java.lang.String)
* @deprecated use {@link #escapeXml10(java.lang.String)} or {@link #escapeXml11(java.lang.String)} instead.
*/
@Deprecated
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
public static final String escapeXml(final String input) {
    // Legacy behaviour: run the input through the deprecated ESCAPE_XML translator unchanged.
    final String escaped = ESCAPE_XML.translate(input);
    return escaped;
}
/**
* <p>Escapes the characters in a {@code String} using XML entities.</p>
*
* <p>For example: {@code "bread" & "butter"} =&gt;
* {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
* </p>
*
* <p>Note that XML 1.0 is a text-only format: it cannot represent control
* characters or unpaired Unicode surrogate codepoints, even after escaping.
* {@code escapeXml10} will remove characters that do not fit in the
* following ranges:</p>
*
* <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
*
* <p>Though not strictly necessary, {@code escapeXml10} will escape
* characters in the following ranges:</p>
*
* <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p>
*
* <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1
* document. If you want to allow more non-text characters in an XML 1.1
* document, use {@link #escapeXml11(String)}.</p>
*
* @param input the {@code String} to escape, may be null
* @return a new escaped {@code String}, {@code null} if null string input
* @see #unescapeXml(java.lang.String)
* @since 3.3
*/
public static String escapeXml10(final String input) {
    // All removal and escaping rules live in the ESCAPE_XML10 translator.
    return ESCAPE_XML10.translate(input);
}
/**
* <p>Escapes the characters in a {@code String} using XML entities.</p>
*
* <p>For example: {@code "bread" & "butter"} =&gt;
* {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
* </p>
*
* <p>XML 1.1 can represent certain control characters, but it cannot represent
* the null byte or unpaired Unicode surrogate codepoints, even after escaping.
* {@code escapeXml11} will remove characters that do not fit in the following
* ranges:</p>
*
* <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
*
* <p>{@code escapeXml11} will escape characters in the following ranges:</p>
*
* <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p>
*
* <p>The returned string can be inserted into a valid XML 1.1 document. Do not
* use it for XML 1.0 documents.</p>
*
* @param input the {@code String} to escape, may be null
* @return a new escaped {@code String}, {@code null} if null string input
* @see #unescapeXml(java.lang.String)
* @since 3.3
*/
public static String escapeXml11(final String input) {
    // All removal and escaping rules live in the ESCAPE_XML11 translator.
    return ESCAPE_XML11.translate(input);
}
//-----------------------------------------------------------------------
/**
* <p>Unescapes a string containing XML entity escapes to a string
@ -599,6 +748,8 @@ public static final String escapeXml(final String input) {
* @param input the {@code String} to unescape, may be null
* @return a new unescaped {@code String}, {@code null} if null string input
* @see #escapeXml(String)
* @see #escapeXml10(String)
* @see #escapeXml11(String)
*/
public static final String unescapeXml(final String input) {
return UNESCAPE_XML.translate(input);

View File

@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.lang3.text.translate;
import java.io.IOException;
import java.io.Writer;
/**
 * Helper subclass to CharSequenceTranslator to remove unpaired surrogates.
 *
 * <p>Any code point handed to {@link #translate(int, Writer)} that lies in the range
 * {@link Character#MIN_SURROGATE} to {@link Character#MAX_SURROGATE} is consumed
 * without producing output; every other code point is left for other translators.</p>
 *
 * @version $Id$
 */
public class UnicodeUnpairedSurrogateRemover extends CodePointTranslator {

    /**
     * Implementation of translate that throws out unpaired surrogates.
     * {@inheritDoc}
     *
     * @return {@code true} if {@code codepoint} is in the surrogate range (nothing is
     *         written to {@code out}), {@code false} otherwise
     */
    @Override
    public boolean translate(int codepoint, Writer out) throws IOException {
        // Reporting the code point as translated while writing nothing is what drops it;
        // returning false leaves a non-surrogate code point untranslated.
        final boolean inSurrogateRange =
                Character.MIN_SURROGATE <= codepoint && codepoint <= Character.MAX_SURROGATE;
        return inSurrogateRange;
    }
}

View File

@ -291,6 +291,7 @@ public void testEscapeHtmlVersions() throws Exception {
}
@Test
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
public void testEscapeXml() throws Exception {
assertEquals("&lt;abc&gt;", StringEscapeUtils.escapeXml("<abc>"));
assertEquals("<abc>", StringEscapeUtils.unescapeXml("&lt;abc&gt;"));
@ -326,6 +327,38 @@ public void testEscapeXml() throws Exception {
}
assertEquals("XML was unescaped incorrectly", "<abc>", sw.toString() );
}
/**
 * Checks {@code escapeXml10}: the five basic entities are escaped, tab/CR/LF pass through,
 * characters XML 1.0 cannot represent are removed, and #x7f-#x84 | #x86-#x9f are written
 * as numeric entities while #x85 is left untouched.
 */
@Test
public void testEscapeXml10() throws Exception {
    assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml10("a<b>c\"d'e&f"));
    assertEquals("XML 1.0 should not escape \t \n \r",
            "a\tb\rc\nd", StringEscapeUtils.escapeXml10("a\tb\rc\nd"));
    // Samples the boundaries of each removed range; the upper bound exercised is #x1f.
    assertEquals("XML 1.0 should omit #x0-#x8 | #xb | #xc | #xe-#x1f",
            "ab", StringEscapeUtils.escapeXml10("a\u0000\u0001\u0008\u000b\u000c\u000e\u001fb"));
    assertEquals("XML 1.0 should omit #xd800-#xdfff",
            "a\ud7ff \ue000b", StringEscapeUtils.escapeXml10("a\ud7ff\ud800 \udfff \ue000b"));
    assertEquals("XML 1.0 should omit #xfffe | #xffff",
            "a\ufffdb", StringEscapeUtils.escapeXml10("a\ufffd\ufffe\uffffb"));
    assertEquals("XML 1.0 should escape #x7f-#x84 | #x86 - #x9f, for XML 1.1 compatibility",
            "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml10("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
}
/**
 * Checks {@code escapeXml11}: the five basic entities are escaped, tab/CR/LF pass through,
 * #x0, unpaired surrogates and #xfffe/#xffff are removed, and the remaining control
 * characters are written as numeric entities while #x85 is left untouched.
 */
@Test
public void testEscapeXml11() throws Exception {
    assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml11("a<b>c\"d'e&f"));
    assertEquals("XML 1.1 should not escape \t \n \r",
            "a\tb\rc\nd", StringEscapeUtils.escapeXml11("a\tb\rc\nd"));
    assertEquals("XML 1.1 should omit #x0",
            "ab", StringEscapeUtils.escapeXml11("a\u0000b"));
    // Samples the boundaries of each escaped range; the upper bound exercised is #x1f.
    assertEquals("XML 1.1 should escape #x1-#x8 | #xb | #xc | #xe-#x1f",
            "a&#1;&#8;&#11;&#12;&#14;&#31;b", StringEscapeUtils.escapeXml11("a\u0001\u0008\u000b\u000c\u000e\u001fb"));
    assertEquals("XML 1.1 should escape #x7F-#x84 | #x86-#x9F",
            "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml11("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
    assertEquals("XML 1.1 should omit #xd800-#xdfff",
            "a\ud7ff \ue000b", StringEscapeUtils.escapeXml11("a\ud7ff\ud800 \udfff \ue000b"));
    assertEquals("XML 1.1 should omit #xfffe | #xffff",
            "a\ufffdb", StringEscapeUtils.escapeXml11("a\ufffd\ufffe\uffffb"));
}
/**
* Tests Supplementary characters.
@ -342,6 +375,7 @@ public void testEscapeXml() throws Exception {
* @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
*/
@Test
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
public void testEscapeXmlSupplementaryCharacters() {
final CharSequenceTranslator escapeXml =
StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );
@ -354,6 +388,7 @@ public void testEscapeXmlSupplementaryCharacters() {
}
@Test
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
public void testEscapeXmlAllCharacters() {
// http://www.w3.org/TR/xml/#charsets says:
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character,
@ -534,6 +569,7 @@ public void testLang708() throws IOException {
* Tests https://issues.apache.org/jira/browse/LANG-720
*/
@Test
@SuppressWarnings( "deprecation" ) // escapeXml(String) has been replaced by escapeXml10(String) and escapeXml11(String) in 3.3
public void testLang720() {
final String input = new StringBuilder("\ud842\udfb7").append("A").toString();
final String escaped = StringEscapeUtils.escapeXml(input);

View File

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.lang3.text.translate;
import static org.junit.Assert.*;
import java.io.CharArrayWriter;
import java.io.IOException;
import org.junit.Test;
/**
 * Unit tests for {@link org.apache.commons.lang3.text.translate.UnicodeUnpairedSurrogateRemover}.
 *
 * <p>Probes the code points immediately outside and at the edges of the surrogate
 * range and verifies that the translator never writes any output.</p>
 *
 * @version $Id$
 */
public class UnicodeUnpairedSurrogateRemoverTest {
    final UnicodeUnpairedSurrogateRemover subject = new UnicodeUnpairedSurrogateRemover();
    final CharArrayWriter writer = new CharArrayWriter(); // nothing is ever written to it

    /** Code points just below and just above the surrogate range are not translated. */
    @Test
    public void testValidCharacters() throws IOException {
        final boolean belowRange = subject.translate(0xd7ff, writer);
        assertFalse(belowRange);
        final boolean aboveRange = subject.translate(0xe000, writer);
        assertFalse(aboveRange);
        assertEquals(0, writer.size());
    }

    /** The first and last surrogate code points are consumed without any output. */
    @Test
    public void testInvalidCharacters() throws IOException {
        final boolean firstSurrogate = subject.translate(0xd800, writer);
        assertTrue(firstSurrogate);
        final boolean lastSurrogate = subject.translate(0xdfff, writer);
        assertTrue(lastSurrogate);
        assertEquals(0, writer.size());
    }
}