mirror of
https://github.com/apache/commons-lang.git
synced 2025-02-11 12:35:07 +00:00
LANG-955: Add methods for removing all invalid characters according to XML 1.0 and XML 1.1 in an input string to StringEscapeUtils. Thanks to Adam Hooper
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1568639 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ab14240150
commit
74fa00c3c5
@ -22,6 +22,7 @@
|
||||
<body>
|
||||
|
||||
<release version="3.3" date="TBA" description="Bugfix and Feature release">
|
||||
<action issue="LANG-955" type="add" dev="britter" due-to="Adam Hooper">Add methods for removing all invalid characters according to XML 1.0 and XML 1.1 in an input string to StringEscapeUtils</action>
|
||||
<action issue="LANG-977" type="fix" dev="britter" due-to="Chris Karcher">NumericEntityEscaper incorrectly encodes supplementary characters</action>
|
||||
<action issue="LANG-973" type="fix" dev="sebb">Make some private fields final</action>
|
||||
<action issue="LANG-971" type="fix" dev="sebb">NumberUtils#isNumber(String) fails to reject invalid Octal numbers</action>
|
||||
|
@ -24,9 +24,11 @@
|
||||
import org.apache.commons.lang3.text.translate.EntityArrays;
|
||||
import org.apache.commons.lang3.text.translate.JavaUnicodeEscaper;
|
||||
import org.apache.commons.lang3.text.translate.LookupTranslator;
|
||||
import org.apache.commons.lang3.text.translate.NumericEntityEscaper;
|
||||
import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
|
||||
import org.apache.commons.lang3.text.translate.OctalUnescaper;
|
||||
import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
|
||||
import org.apache.commons.lang3.text.translate.UnicodeUnpairedSurrogateRemover;
|
||||
|
||||
/**
|
||||
* <p>Escapes and unescapes {@code String}s for
|
||||
@ -111,12 +113,94 @@ public class StringEscapeUtils {
|
||||
* as the foundation for a custom translator.
|
||||
*
|
||||
* @since 3.0
|
||||
* @deprecated use {@link #ESCAPE_XML10} or {@link #ESCAPE_XML11} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final CharSequenceTranslator ESCAPE_XML =
|
||||
new AggregateTranslator(
|
||||
new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
|
||||
new LookupTranslator(EntityArrays.APOS_ESCAPE())
|
||||
);
|
||||
|
||||
/**
|
||||
* Translator object for escaping XML 1.0.
|
||||
*
|
||||
* While {@link #escapeXml10(String)} is the expected method of use, this
|
||||
* object allows the XML escaping functionality to be used
|
||||
* as the foundation for a custom translator.
|
||||
*
|
||||
* @since 3.3
|
||||
*/
|
||||
public static final CharSequenceTranslator ESCAPE_XML10 =
|
||||
new AggregateTranslator(
|
||||
new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
|
||||
new LookupTranslator(EntityArrays.APOS_ESCAPE()),
|
||||
new LookupTranslator(
|
||||
new String[][] {
|
||||
{ "\u0000", "" },
|
||||
{ "\u0001", "" },
|
||||
{ "\u0002", "" },
|
||||
{ "\u0003", "" },
|
||||
{ "\u0004", "" },
|
||||
{ "\u0005", "" },
|
||||
{ "\u0006", "" },
|
||||
{ "\u0007", "" },
|
||||
{ "\u0008", "" },
|
||||
{ "\u000b", "" },
|
||||
{ "\u000c", "" },
|
||||
{ "\u000e", "" },
|
||||
{ "\u000f", "" },
|
||||
{ "\u0010", "" },
|
||||
{ "\u0011", "" },
|
||||
{ "\u0012", "" },
|
||||
{ "\u0013", "" },
|
||||
{ "\u0014", "" },
|
||||
{ "\u0015", "" },
|
||||
{ "\u0016", "" },
|
||||
{ "\u0017", "" },
|
||||
{ "\u0018", "" },
|
||||
{ "\u0019", "" },
|
||||
{ "\u001a", "" },
|
||||
{ "\u001b", "" },
|
||||
{ "\u001c", "" },
|
||||
{ "\u001d", "" },
|
||||
{ "\u001e", "" },
|
||||
{ "\u001f", "" },
|
||||
{ "\ufffe", "" },
|
||||
{ "\uffff", "" }
|
||||
}),
|
||||
NumericEntityEscaper.between(0x7f, 0x84),
|
||||
NumericEntityEscaper.between(0x86, 0x9f),
|
||||
new UnicodeUnpairedSurrogateRemover()
|
||||
);
|
||||
|
||||
/**
|
||||
* Translator object for escaping XML 1.1.
|
||||
*
|
||||
* While {@link #escapeXml11(String)} is the expected method of use, this
|
||||
* object allows the XML escaping functionality to be used
|
||||
* as the foundation for a custom translator.
|
||||
*
|
||||
* @since 3.3
|
||||
*/
|
||||
public static final CharSequenceTranslator ESCAPE_XML11 =
|
||||
new AggregateTranslator(
|
||||
new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
|
||||
new LookupTranslator(EntityArrays.APOS_ESCAPE()),
|
||||
new LookupTranslator(
|
||||
new String[][] {
|
||||
{ "\u0000", "" },
|
||||
{ "\u000b", "" },
|
||||
{ "\u000c", "" },
|
||||
{ "\ufffe", "" },
|
||||
{ "\uffff", "" }
|
||||
}),
|
||||
NumericEntityEscaper.between(0x1, 0x8),
|
||||
NumericEntityEscaper.between(0xe, 0x1f),
|
||||
NumericEntityEscaper.between(0x7f, 0x84),
|
||||
NumericEntityEscaper.between(0x86, 0x9f),
|
||||
new UnicodeUnpairedSurrogateRemover()
|
||||
);
|
||||
|
||||
/**
|
||||
* Translator object for escaping HTML version 3.0.
|
||||
@ -579,11 +663,76 @@ public static final String unescapeHtml3(final String input) {
|
||||
* @param input the {@code String} to escape, may be null
|
||||
* @return a new escaped {@code String}, {@code null} if null string input
|
||||
* @see #unescapeXml(java.lang.String)
|
||||
* @deprecated use {@link #escapeXml10(java.lang.String)} or {@link #escapeXml11(java.lang.String)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
|
||||
public static final String escapeXml(final String input) {
|
||||
return ESCAPE_XML.translate(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Escapes the characters in a {@code String} using XML entities.</p>
|
||||
*
|
||||
* <p>For example: <tt>"bread" & "butter"</tt> =>
|
||||
* <tt>&quot;bread&quot; &amp; &quot;butter&quot;</tt>.
|
||||
* </p>
|
||||
*
|
||||
* <p>Note that XML 1.0 is a text-only format: it cannot represent control
|
||||
* characters or unpaired Unicode surrogate codepoints, even after escaping.
|
||||
* {@code escapeXml10} will remove characters that do not fit in the
|
||||
* following ranges:</p>
|
||||
*
|
||||
* <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
|
||||
*
|
||||
* <p>Though not strictly necessary, {@code escapeXml10} will escape
|
||||
* characters in the following ranges:</p>
|
||||
*
|
||||
* <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p>
|
||||
*
|
||||
* <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1
|
||||
* document. If you want to allow more non-text characters in an XML 1.1
|
||||
* document, use {@link #escapeXml11(String)}.</p>
|
||||
*
|
||||
* @param input the {@code String} to escape, may be null
|
||||
* @return a new escaped {@code String}, {@code null} if null string input
|
||||
* @see #unescapeXml(java.lang.String)
|
||||
* @since 3.3
|
||||
*/
|
||||
public static String escapeXml10(final String input) {
|
||||
return ESCAPE_XML10.translate(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Escapes the characters in a {@code String} using XML entities.</p>
|
||||
*
|
||||
* <p>For example: <tt>"bread" & "butter"</tt> =>
|
||||
* <tt>&quot;bread&quot; &amp; &quot;butter&quot;</tt>.
|
||||
* </p>
|
||||
*
|
||||
* <p>XML 1.1 can represent certain control characters, but it cannot represent
|
||||
* the null byte or unpaired Unicode surrogate codepoints, even after escaping.
|
||||
* {@code escapeXml11} will remove characters that do not fit in the following
|
||||
* ranges:</p>
|
||||
*
|
||||
* <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
|
||||
*
|
||||
* <p>{@code escapeXml11} will escape characters in the following ranges:</p>
|
||||
*
|
||||
* <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p>
|
||||
*
|
||||
* <p>The returned string can be inserted into a valid XML 1.1 document. Do not
|
||||
* use it for XML 1.0 documents.</p>
|
||||
*
|
||||
* @param input the {@code String} to escape, may be null
|
||||
* @return a new escaped {@code String}, {@code null} if null string input
|
||||
* @see #unescapeXml(java.lang.String)
|
||||
* @since 3.3
|
||||
*/
|
||||
public static String escapeXml11(final String input) {
|
||||
return ESCAPE_XML11.translate(input);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* <p>Unescapes a string containing XML entity escapes to a string
|
||||
@ -599,6 +748,8 @@ public static final String escapeXml(final String input) {
|
||||
* @param input the {@code String} to unescape, may be null
|
||||
* @return a new unescaped {@code String}, {@code null} if null string input
|
||||
* @see #escapeXml(String)
|
||||
* @see #escapeXml10(String)
|
||||
* @see #escapeXml11(String)
|
||||
*/
|
||||
public static final String unescapeXml(final String input) {
|
||||
return UNESCAPE_XML.translate(input);
|
||||
|
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.commons.lang3.text.translate;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
|
||||
/**
|
||||
* Helper subclass to CharSequenceTranslator to remove unpaired surrogates.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class UnicodeUnpairedSurrogateRemover extends CodePointTranslator {
|
||||
/**
|
||||
* Implementation of translate that throws out unpaired surrogates.
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean translate(int codepoint, Writer out) throws IOException {
|
||||
if (codepoint >= Character.MIN_SURROGATE && codepoint <= Character.MAX_SURROGATE) {
|
||||
// It's a surrogate. Write nothing and say we've translated.
|
||||
return true;
|
||||
} else {
|
||||
// It's not a surrogate. Don't translate it.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -291,6 +291,7 @@ public void testEscapeHtmlVersions() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
|
||||
public void testEscapeXml() throws Exception {
|
||||
assertEquals("<abc>", StringEscapeUtils.escapeXml("<abc>"));
|
||||
assertEquals("<abc>", StringEscapeUtils.unescapeXml("<abc>"));
|
||||
@ -326,6 +327,38 @@ public void testEscapeXml() throws Exception {
|
||||
}
|
||||
assertEquals("XML was unescaped incorrectly", "<abc>", sw.toString() );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEscapeXml10() throws Exception {
|
||||
assertEquals("a<b>c"d'e&f", StringEscapeUtils.escapeXml10("a<b>c\"d'e&f"));
|
||||
assertEquals("XML 1.0 should not escape \t \n \r",
|
||||
"a\tb\rc\nd", StringEscapeUtils.escapeXml10("a\tb\rc\nd"));
|
||||
assertEquals("XML 1.0 should omit most #x0-x8 | #xb | #xc | #xe-#x19",
|
||||
"ab", StringEscapeUtils.escapeXml10("a\u0000\u0001\u0008\u000b\u000c\u000e\u001fb"));
|
||||
assertEquals("XML 1.0 should omit #xd800-#xdfff",
|
||||
"a\ud7ff \ue000b", StringEscapeUtils.escapeXml10("a\ud7ff\ud800 \udfff \ue000b"));
|
||||
assertEquals("XML 1.0 should omit #xfffe | #xffff",
|
||||
"a\ufffdb", StringEscapeUtils.escapeXml10("a\ufffd\ufffe\uffffb"));
|
||||
assertEquals("XML 1.0 should escape #x7f-#x84 | #x86 - #x9f, for XML 1.1 compatibility",
|
||||
"a\u007e„\u0085†Ÿ\u00a0b", StringEscapeUtils.escapeXml10("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEscapeXml11() throws Exception {
|
||||
assertEquals("a<b>c"d'e&f", StringEscapeUtils.escapeXml11("a<b>c\"d'e&f"));
|
||||
assertEquals("XML 1.1 should not escape \t \n \r",
|
||||
"a\tb\rc\nd", StringEscapeUtils.escapeXml11("a\tb\rc\nd"));
|
||||
assertEquals("XML 1.1 should omit #x0",
|
||||
"ab", StringEscapeUtils.escapeXml11("a\u0000b"));
|
||||
assertEquals("XML 1.1 should escape #x1-x8 | #xb | #xc | #xe-#x19",
|
||||
"ab", StringEscapeUtils.escapeXml11("a\u0001\u0008\u000b\u000c\u000e\u001fb"));
|
||||
assertEquals("XML 1.1 should escape #x7F-#x84 | #x86-#x9F",
|
||||
"a\u007e„\u0085†Ÿ\u00a0b", StringEscapeUtils.escapeXml11("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
|
||||
assertEquals("XML 1.1 should omit #xd800-#xdfff",
|
||||
"a\ud7ff \ue000b", StringEscapeUtils.escapeXml11("a\ud7ff\ud800 \udfff \ue000b"));
|
||||
assertEquals("XML 1.1 should omit #xfffe | #xffff",
|
||||
"a\ufffdb", StringEscapeUtils.escapeXml11("a\ufffd\ufffe\uffffb"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests Supplementary characters.
|
||||
@ -342,6 +375,7 @@ public void testEscapeXml() throws Exception {
|
||||
* @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
|
||||
*/
|
||||
@Test
|
||||
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
|
||||
public void testEscapeXmlSupplementaryCharacters() {
|
||||
final CharSequenceTranslator escapeXml =
|
||||
StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );
|
||||
@ -354,6 +388,7 @@ public void testEscapeXmlSupplementaryCharacters() {
|
||||
}
|
||||
|
||||
@Test
|
||||
@SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
|
||||
public void testEscapeXmlAllCharacters() {
|
||||
// http://www.w3.org/TR/xml/#charsets says:
|
||||
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character,
|
||||
@ -534,6 +569,7 @@ public void testLang708() throws IOException {
|
||||
* Tests https://issues.apache.org/jira/browse/LANG-720
|
||||
*/
|
||||
@Test
|
||||
@SuppressWarnings( "deprecation" ) // escapeXml(String) has been replaced by escapeXml10(String) and escapeXml11(String) in 3.3
|
||||
public void testLang720() {
|
||||
final String input = new StringBuilder("\ud842\udfb7").append("A").toString();
|
||||
final String escaped = StringEscapeUtils.escapeXml(input);
|
||||
|
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.commons.lang3.text.translate;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.CharArrayWriter;
|
||||
import java.io.IOException;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link org.apache.commons.lang3.text.translate.UnicodeUnpairedSurrogateRemover}.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class UnicodeUnpairedSurrogateRemoverTest {
|
||||
final UnicodeUnpairedSurrogateRemover subject = new UnicodeUnpairedSurrogateRemover();
|
||||
final CharArrayWriter writer = new CharArrayWriter(); // nothing is ever written to it
|
||||
|
||||
@Test
|
||||
public void testValidCharacters() throws IOException {
|
||||
assertEquals(false, subject.translate(0xd7ff, writer));
|
||||
assertEquals(false, subject.translate(0xe000, writer));
|
||||
assertEquals(0, writer.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInvalidCharacters() throws IOException {
|
||||
assertEquals(true, subject.translate(0xd800, writer));
|
||||
assertEquals(true, subject.translate(0xdfff, writer));
|
||||
assertEquals(0, writer.size());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user