diff --git a/src/java/org/apache/commons/lang/Entities.java b/src/java/org/apache/commons/lang/Entities.java new file mode 100644 index 000000000..e5526aaff --- /dev/null +++ b/src/java/org/apache/commons/lang/Entities.java @@ -0,0 +1,417 @@ +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2002-2003 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, if + * any, must include the following acknowlegement: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowlegement may appear in the software itself, + * if and wherever such third-party acknowlegements normally appear. + * + * 4. The names "The Jakarta Project", "Commons", and "Apache Software + * Foundation" must not be used to endorse or promote products derived + * from this software without prior written permission. For written + * permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache" + * nor may "Apache" appear in their names without prior written + * permission of the Apache Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ +package org.apache.commons.lang; + +import java.util.Map; +import java.util.HashMap; +import java.io.IOException; +import java.io.Writer; + +import org.apache.commons.lang.exception.NestableRuntimeException; + +//todo: unit test and make public + +/** + *

HTML and XML entity utility class

+ * + *

/**
 * Two-way lookup between entity names (for example "amp") and the numeric
 * character codes they stand for (for example 38).
 *
 * Three ready-made sets are provided: {@link #XML}, {@link #HTML32} and
 * {@link #HTML40}. Instances cannot be created from outside this class.
 *
 * The HTML 4.0 table follows http://www.w3.org/TR/REC-html40/sgml/entities.html;
 * see also http://hotwired.lycos.com/webmonkey/reference/special_characters/
 *
 * @author Alexander Day Chaffee
 * @since 2.0
 * @version $Id: Entities.java,v 1.1 2003/04/09 20:55:32 alex Exp $
 */
class Entities {

    /** Entities shared by every set; each row is {name, decimal character code}. */
    private static final String[][] basic = {
        {"quot", "34"}, // " - double-quote
        {"amp", "38"}, // & - ampersand
        {"lt", "60"}, // < - less-than
        {"gt", "62"}, // > - greater-than
    };

    /** Named entities for the upper half of ISO 8859-1 (160-255). */
    private static final String[][] iso8859_1 = {
        {"nbsp", "160"}, // non-breaking space
        {"iexcl", "161"}, // inverted exclamation mark
        {"cent", "162"}, // cent sign
        {"pound", "163"}, // pound sign
        {"curren", "164"}, // currency sign
        {"yen", "165"}, // yen sign = yuan sign
        {"brvbar", "166"}, // broken bar = broken vertical bar
        {"sect", "167"}, // section sign
        {"uml", "168"}, // diaeresis = spacing diaeresis
        {"copy", "169"}, // copyright sign
        {"ordf", "170"}, // feminine ordinal indicator
        {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
        {"not", "172"}, // not sign
        {"shy", "173"}, // soft hyphen = discretionary hyphen
        {"reg", "174"}, // registered trademark sign
        {"macr", "175"}, // macron = spacing macron = overline = APL overbar
        {"deg", "176"}, // degree sign
        {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
        {"sup2", "178"}, // superscript two = superscript digit two = squared
        {"sup3", "179"}, // superscript three = superscript digit three = cubed
        {"acute", "180"}, // acute accent = spacing acute
        {"micro", "181"}, // micro sign
        {"para", "182"}, // pilcrow sign = paragraph sign
        {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
        {"cedil", "184"}, // cedilla = spacing cedilla
        {"sup1", "185"}, // superscript one = superscript digit one
        {"ordm", "186"}, // masculine ordinal indicator
        {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
        {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
        {"frac12", "189"}, // vulgar fraction one half = fraction one half
        {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
        {"iquest", "191"}, // inverted question mark = turned question mark
        {"Agrave", "192"}, // uppercase A, grave accent
        {"Aacute", "193"}, // uppercase A, acute accent
        {"Acirc", "194"}, // uppercase A, circumflex accent
        {"Atilde", "195"}, // uppercase A, tilde
        {"Auml", "196"}, // uppercase A, umlaut
        {"Aring", "197"}, // uppercase A, ring
        {"AElig", "198"}, // uppercase AE
        {"Ccedil", "199"}, // uppercase C, cedilla
        {"Egrave", "200"}, // uppercase E, grave accent
        {"Eacute", "201"}, // uppercase E, acute accent
        {"Ecirc", "202"}, // uppercase E, circumflex accent
        {"Euml", "203"}, // uppercase E, umlaut
        {"Igrave", "204"}, // uppercase I, grave accent
        {"Iacute", "205"}, // uppercase I, acute accent
        {"Icirc", "206"}, // uppercase I, circumflex accent
        {"Iuml", "207"}, // uppercase I, umlaut
        {"ETH", "208"}, // uppercase Eth, Icelandic
        {"Ntilde", "209"}, // uppercase N, tilde
        {"Ograve", "210"}, // uppercase O, grave accent
        {"Oacute", "211"}, // uppercase O, acute accent
        {"Ocirc", "212"}, // uppercase O, circumflex accent
        {"Otilde", "213"}, // uppercase O, tilde
        {"Ouml", "214"}, // uppercase O, umlaut
        {"times", "215"}, // multiplication sign
        {"Oslash", "216"}, // uppercase O, slash
        {"Ugrave", "217"}, // uppercase U, grave accent
        {"Uacute", "218"}, // uppercase U, acute accent
        {"Ucirc", "219"}, // uppercase U, circumflex accent
        {"Uuml", "220"}, // uppercase U, umlaut
        {"Yacute", "221"}, // uppercase Y, acute accent
        {"THORN", "222"}, // uppercase THORN, Icelandic
        {"szlig", "223"}, // lowercase sharps, German
        {"agrave", "224"}, // lowercase a, grave accent
        {"aacute", "225"}, // lowercase a, acute accent
        {"acirc", "226"}, // lowercase a, circumflex accent
        {"atilde", "227"}, // lowercase a, tilde
        {"auml", "228"}, // lowercase a, umlaut
        {"aring", "229"}, // lowercase a, ring
        {"aelig", "230"}, // lowercase ae
        {"ccedil", "231"}, // lowercase c, cedilla
        {"egrave", "232"}, // lowercase e, grave accent
        {"eacute", "233"}, // lowercase e, acute accent
        {"ecirc", "234"}, // lowercase e, circumflex accent
        {"euml", "235"}, // lowercase e, umlaut
        {"igrave", "236"}, // lowercase i, grave accent
        {"iacute", "237"}, // lowercase i, acute accent
        {"icirc", "238"}, // lowercase i, circumflex accent
        {"iuml", "239"}, // lowercase i, umlaut
        {"eth", "240"}, // lowercase eth, Icelandic
        {"ntilde", "241"}, // lowercase n, tilde
        {"ograve", "242"}, // lowercase o, grave accent
        {"oacute", "243"}, // lowercase o, acute accent
        {"ocirc", "244"}, // lowercase o, circumflex accent
        {"otilde", "245"}, // lowercase o, tilde
        {"ouml", "246"}, // lowercase o, umlaut
        {"divide", "247"}, // division sign
        {"oslash", "248"}, // lowercase o, slash
        {"ugrave", "249"}, // lowercase u, grave accent
        {"uacute", "250"}, // lowercase u, acute accent
        {"ucirc", "251"}, // lowercase u, circumflex accent
        {"uuml", "252"}, // lowercase u, umlaut
        {"yacute", "253"}, // lowercase y, acute accent
        {"thorn", "254"}, // lowercase thorn, Icelandic
        {"yuml", "255"}, // lowercase y, umlaut
    };

    /**
     * Entities introduced by HTML 4.0 on top of the two tables above.
     * See http://www.w3.org/TR/REC-html40/sgml/entities.html
     * Kept package-visible, as before.
     */
    static final String[][] html40 = {
        // Latin Extended-B
        {"fnof", "402"}, // latin small f with hook = function = florin, U+0192
        // Greek
        {"Alpha", "913"}, // greek capital letter alpha, U+0391
        {"Beta", "914"}, // greek capital letter beta, U+0392
        {"Gamma", "915"}, // greek capital letter gamma, U+0393
        {"Delta", "916"}, // greek capital letter delta, U+0394
        {"Epsilon", "917"}, // greek capital letter epsilon, U+0395
        {"Zeta", "918"}, // greek capital letter zeta, U+0396
        {"Eta", "919"}, // greek capital letter eta, U+0397
        {"Theta", "920"}, // greek capital letter theta, U+0398
        {"Iota", "921"}, // greek capital letter iota, U+0399
        {"Kappa", "922"}, // greek capital letter kappa, U+039A
        {"Lambda", "923"}, // greek capital letter lambda, U+039B
        {"Mu", "924"}, // greek capital letter mu, U+039C
        {"Nu", "925"}, // greek capital letter nu, U+039D
        {"Xi", "926"}, // greek capital letter xi, U+039E
        {"Omicron", "927"}, // greek capital letter omicron, U+039F
        {"Pi", "928"}, // greek capital letter pi, U+03A0
        {"Rho", "929"}, // greek capital letter rho, U+03A1
        // (930 is skipped by the table: no entity is listed for U+03A2)
        {"Sigma", "931"}, // greek capital letter sigma, U+03A3
        {"Tau", "932"}, // greek capital letter tau, U+03A4
        {"Upsilon", "933"}, // greek capital letter upsilon, U+03A5
        {"Phi", "934"}, // greek capital letter phi, U+03A6
        {"Chi", "935"}, // greek capital letter chi, U+03A7
        {"Psi", "936"}, // greek capital letter psi, U+03A8
        {"Omega", "937"}, // greek capital letter omega, U+03A9
        {"alpha", "945"}, // greek small letter alpha, U+03B1
        {"beta", "946"}, // greek small letter beta, U+03B2
        {"gamma", "947"}, // greek small letter gamma, U+03B3
        {"delta", "948"}, // greek small letter delta, U+03B4
        {"epsilon", "949"}, // greek small letter epsilon, U+03B5
        {"zeta", "950"}, // greek small letter zeta, U+03B6
        {"eta", "951"}, // greek small letter eta, U+03B7
        {"theta", "952"}, // greek small letter theta, U+03B8
        {"iota", "953"}, // greek small letter iota, U+03B9
        {"kappa", "954"}, // greek small letter kappa, U+03BA
        {"lambda", "955"}, // greek small letter lambda, U+03BB
        {"mu", "956"}, // greek small letter mu, U+03BC
        {"nu", "957"}, // greek small letter nu, U+03BD
        {"xi", "958"}, // greek small letter xi, U+03BE
        {"omicron", "959"}, // greek small letter omicron, U+03BF
        {"pi", "960"}, // greek small letter pi, U+03C0
        {"rho", "961"}, // greek small letter rho, U+03C1
        {"sigmaf", "962"}, // greek small letter final sigma, U+03C2
        {"sigma", "963"}, // greek small letter sigma, U+03C3
        {"tau", "964"}, // greek small letter tau, U+03C4
        {"upsilon", "965"}, // greek small letter upsilon, U+03C5
        {"phi", "966"}, // greek small letter phi, U+03C6
        {"chi", "967"}, // greek small letter chi, U+03C7
        {"psi", "968"}, // greek small letter psi, U+03C8
        {"omega", "969"}, // greek small letter omega, U+03C9
        {"thetasym", "977"}, // greek small letter theta symbol, U+03D1
        {"upsih", "978"}, // greek upsilon with hook symbol, U+03D2
        {"piv", "982"}, // greek pi symbol, U+03D6
        // General punctuation
        {"bull", "8226"}, // bullet = black small circle, U+2022
        {"hellip", "8230"}, // horizontal ellipsis = three dot leader, U+2026
        {"prime", "8242"}, // prime = minutes = feet, U+2032
        {"Prime", "8243"}, // double prime = seconds = inches, U+2033
        {"oline", "8254"}, // overline = spacing overscore, U+203E
        {"frasl", "8260"}, // fraction slash, U+2044
        // Letterlike symbols
        {"weierp", "8472"}, // script capital P = power set = Weierstrass p, U+2118
        {"image", "8465"}, // blackletter capital I = imaginary part, U+2111
        {"real", "8476"}, // blackletter capital R = real part symbol, U+211C
        {"trade", "8482"}, // trade mark sign, U+2122
        {"alefsym", "8501"}, // alef symbol = first transfinite cardinal, U+2135
        // Arrows
        {"larr", "8592"}, // leftwards arrow, U+2190
        {"uarr", "8593"}, // upwards arrow, U+2191
        {"rarr", "8594"}, // rightwards arrow, U+2192
        {"darr", "8595"}, // downwards arrow, U+2193
        {"harr", "8596"}, // left right arrow, U+2194
        {"crarr", "8629"}, // downwards arrow with corner leftwards = carriage return, U+21B5
        {"lArr", "8656"}, // leftwards double arrow, U+21D0
        {"uArr", "8657"}, // upwards double arrow, U+21D1
        {"rArr", "8658"}, // rightwards double arrow, U+21D2
        {"dArr", "8659"}, // downwards double arrow, U+21D3
        {"hArr", "8660"}, // left right double arrow, U+21D4
        // Mathematical operators
        {"forall", "8704"}, // for all, U+2200
        {"part", "8706"}, // partial differential, U+2202
        {"exist", "8707"}, // there exists, U+2203
        {"empty", "8709"}, // empty set = null set = diameter, U+2205
        {"nabla", "8711"}, // nabla = backward difference, U+2207
        {"isin", "8712"}, // element of, U+2208
        {"notin", "8713"}, // not an element of, U+2209
        {"ni", "8715"}, // contains as member, U+220B
        {"prod", "8719"}, // n-ary product = product sign, U+220F
        {"sum", "8721"}, // n-ary sumation, U+2211
        {"minus", "8722"}, // minus sign, U+2212
        {"lowast", "8727"}, // asterisk operator, U+2217
        {"radic", "8730"}, // square root = radical sign, U+221A
        {"prop", "8733"}, // proportional to, U+221D
        {"infin", "8734"}, // infinity, U+221E
        {"ang", "8736"}, // angle, U+2220
        {"and", "8743"}, // logical and = wedge, U+2227
        {"or", "8744"}, // logical or = vee, U+2228
        {"cap", "8745"}, // intersection = cap, U+2229
        {"cup", "8746"}, // union = cup, U+222A
        {"int", "8747"}, // integral, U+222B
        {"there4", "8756"}, // therefore, U+2234
        {"sim", "8764"}, // tilde operator = varies with = similar to, U+223C
        {"cong", "8773"}, // approximately equal to, U+2245
        {"asymp", "8776"}, // almost equal to = asymptotic to, U+2248
        {"ne", "8800"}, // not equal to, U+2260
        {"equiv", "8801"}, // identical to, U+2261
        {"le", "8804"}, // less-than or equal to, U+2264
        {"ge", "8805"}, // greater-than or equal to, U+2265
        {"sub", "8834"}, // subset of, U+2282
        {"sup", "8835"}, // superset of, U+2283
        {"nsub", "8836"}, // not a subset of, U+2284 (defined by HTML 4.0; was missing from this table)
        {"sube", "8838"}, // subset of or equal to, U+2286
        {"supe", "8839"}, // superset of or equal to, U+2287
        {"oplus", "8853"}, // circled plus = direct sum, U+2295
        {"otimes", "8855"}, // circled times = vector product, U+2297
        {"perp", "8869"}, // up tack = orthogonal to = perpendicular, U+22A5
        {"sdot", "8901"}, // dot operator, U+22C5
        // Miscellaneous technical
        {"lceil", "8968"}, // left ceiling = apl upstile, U+2308
        {"rceil", "8969"}, // right ceiling, U+2309
        {"lfloor", "8970"}, // left floor = apl downstile, U+230A
        {"rfloor", "8971"}, // right floor, U+230B
        {"lang", "9001"}, // left-pointing angle bracket = bra, U+2329
        {"rang", "9002"}, // right-pointing angle bracket = ket, U+232A
        // Geometric shapes and miscellaneous symbols
        {"loz", "9674"}, // lozenge, U+25CA
        {"spades", "9824"}, // black spade suit, U+2660
        {"clubs", "9827"}, // black club suit = shamrock, U+2663
        {"hearts", "9829"}, // black heart suit = valentine, U+2665
        {"diams", "9830"}, // black diamond suit, U+2666
        // Latin Extended-A
        {"OElig", "338"}, // latin capital ligature OE, U+0152
        {"oelig", "339"}, // latin small ligature oe, U+0153
        {"Scaron", "352"}, // latin capital letter S with caron, U+0160
        {"scaron", "353"}, // latin small letter s with caron, U+0161
        {"Yuml", "376"}, // latin capital letter Y with diaeresis, U+0178
        // Spacing modifier letters
        {"circ", "710"}, // modifier letter circumflex accent, U+02C6
        {"tilde", "732"}, // small tilde, U+02DC
        // General punctuation (special characters)
        {"ensp", "8194"}, // en space, U+2002
        {"emsp", "8195"}, // em space, U+2003
        {"thinsp", "8201"}, // thin space, U+2009
        {"zwnj", "8204"}, // zero width non-joiner, U+200C
        {"zwj", "8205"}, // zero width joiner, U+200D
        {"lrm", "8206"}, // left-to-right mark, U+200E
        {"rlm", "8207"}, // right-to-left mark, U+200F
        {"ndash", "8211"}, // en dash, U+2013
        {"mdash", "8212"}, // em dash, U+2014
        {"lsquo", "8216"}, // left single quotation mark, U+2018
        {"rsquo", "8217"}, // right single quotation mark, U+2019
        {"sbquo", "8218"}, // single low-9 quotation mark, U+201A
        {"ldquo", "8220"}, // left double quotation mark, U+201C
        {"rdquo", "8221"}, // right double quotation mark, U+201D
        {"bdquo", "8222"}, // double low-9 quotation mark, U+201E
        {"dagger", "8224"}, // dagger, U+2020
        {"Dagger", "8225"}, // double dagger, U+2021
        {"permil", "8240"}, // per mille sign, U+2030
        {"lsaquo", "8249"}, // single left-pointing angle quotation mark, U+2039
        {"rsaquo", "8250"}, // single right-pointing angle quotation mark, U+203A
        {"euro", "8364"}, // euro sign, U+20AC
    };

    /**
     * The basic set only: quot, amp, lt, gt.
     * NOTE(review): XML 1.0 also predefines "apos" (39); it is not in this set.
     */
    public static final Entities XML;

    /** The basic set plus the ISO 8859-1 names. */
    public static final Entities HTML32;

    /** The HTML 3.2 set plus the names added by HTML 4.0. */
    public static final Entities HTML40;

    // The tables above are declared first so they are initialized before use here.
    static {
        XML = new Entities();
        XML.addEntities(basic);

        HTML32 = new Entities();
        HTML32.addEntities(basic);
        HTML32.addEntities(iso8859_1);

        HTML40 = new Entities();
        HTML40.addEntities(basic);
        HTML40.addEntities(iso8859_1);
        HTML40.addEntities(html40);
    }

    //todo: refactor into a bi-di map object (or look for one and use it)
    /** Entity name (String) to character code (Integer). */
    private final Map mapNameToValue = new HashMap();

    /** Character code (Integer) to entity name (String). */
    private final Map mapValueToName = new HashMap();

    /** Creates an empty set; only the static initializer above builds instances. */
    private Entities() {
    }

    /**
     * Registers every row of a table in both lookup directions.
     *
     * @param entityArray rows of {name, decimal character code}
     * @throws NumberFormatException if a code is not a decimal integer
     */
    private void addEntities(String[][] entityArray) {
        for (int i = 0; i < entityArray.length; ++i) {
            String name = entityArray[i][0];
            // Parse once and share the same Integer between both maps.
            Integer value = Integer.valueOf(entityArray[i][1]);
            mapNameToValue.put(name, value);
            mapValueToName.put(value, name);
        }
    }

    /**
     * Looks up the entity name for a character code.
     *
     * @param value the character code, for example 38
     * @return the name without '&amp;' and ';' (for example "amp"),
     *         or <code>null</code> if this set has no name for the code
     */
    public String entityName(int value) {
        return (String) mapValueToName.get(new Integer(value));
    }

    /**
     * Looks up the character code for an entity name.
     *
     * @param name the case-sensitive name without '&amp;' and ';', for example "amp"
     * @return the character code, or <code>null</code> if the name is not in this set
     */
    public Integer entityValue(String name) {
        return (Integer) mapNameToValue.get(name);
    }
}

StringEscapeUtils instances should NOT be constructed in * standard programming. Instead, the class should be used as @@ -351,167 +356,72 @@ public class StringEscapeUtils { // HTML and XML //-------------------------------------------------------------------------- - private static class Entities { - // see http://hotwired.lycos.com/webmonkey/reference/special_characters/ - // see http://www.w3.org/TR/REC-html40/sgml/entities.html - static Object[][] entities = { - // {"#39", new Integer(39)}, // ' - apostrophe - {"quot", new Integer(34)}, // " - double-quote - {"amp", new Integer(38)}, // & - ampersand - {"lt", new Integer(60)}, // < - less-than - {"gt", new Integer(62)}, // > - greater-than - {"nbsp", new Integer(160)}, // non-breaking space - {"iexcl", new Integer(161)}, //inverted exclamation mark - {"cent", new Integer(162)}, //cent sign - {"pound", new Integer(163)}, //pound sign - {"curren", new Integer(164)}, //currency sign - {"yen", new Integer(165)}, //yen sign = yuan sign - {"brvbar", new Integer(166)}, //broken bar = broken vertical bar - {"sect", new Integer(167)}, //section sign - {"uml", new Integer(168)}, //diaeresis = spacing diaeresis - {"copy", new Integer(169)}, // © - copyright sign - {"ordf", new Integer(170)}, //feminine ordinal indicator - {"laquo", new Integer(171)}, //left-pointing double angle quotation mark = left pointing guillemet - {"not", new Integer(172)}, //not sign - {"shy", new Integer(173)}, //soft hyphen = discretionary hyphen - {"reg", new Integer(174)}, // ® - registered trademark sign - {"macr", new Integer(175)}, //macron = spacing macron = overline = APL overbar - {"deg", new Integer(176)}, //degree sign - {"plusmn", new Integer(177)}, //plus-minus sign = plus-or-minus sign - {"sup2", new Integer(178)}, //superscript two = superscript digit two = squared - {"sup3", new Integer(179)}, //superscript three = superscript digit three = cubed - {"acute", new Integer(180)}, //acute accent = spacing acute - {"micro", new 
Integer(181)}, //micro sign - {"para", new Integer(182)}, //pilcrow sign = paragraph sign - {"middot", new Integer(183)}, //middle dot = Georgian comma = Greek middle dot - {"cedil", new Integer(184)}, //cedilla = spacing cedilla - {"sup1", new Integer(185)}, //superscript one = superscript digit one - {"ordm", new Integer(186)}, //masculine ordinal indicator - {"raquo", new Integer(187)}, //right-pointing double angle quotation mark = right pointing guillemet - {"frac14", new Integer(188)}, //vulgar fraction one quarter = fraction one quarter - {"frac12", new Integer(189)}, //vulgar fraction one half = fraction one half - {"frac34", new Integer(190)}, //vulgar fraction three quarters = fraction three quarters - {"iquest", new Integer(191)}, //inverted question mark = turned question mark - {"Agrave", new Integer(192)}, // À - uppercase A, grave accent - {"Aacute", new Integer(193)}, // Á - uppercase A, acute accent - {"Acirc", new Integer(194)}, // Â - uppercase A, circumflex accent - {"Atilde", new Integer(195)}, // Ã - uppercase A, tilde - {"Auml", new Integer(196)}, // Ä - uppercase A, umlaut - {"Aring", new Integer(197)}, // Å - uppercase A, ring - {"AElig", new Integer(198)}, // Æ - uppercase AE - {"Ccedil", new Integer(199)}, // Ç - uppercase C, cedilla - {"Egrave", new Integer(200)}, // È - uppercase E, grave accent - {"Eacute", new Integer(201)}, // É - uppercase E, acute accent - {"Ecirc", new Integer(202)}, // Ê - uppercase E, circumflex accent - {"Euml", new Integer(203)}, // Ë - uppercase E, umlaut - {"Igrave", new Integer(204)}, // Ì - uppercase I, grave accent - {"Iacute", new Integer(205)}, // Í - uppercase I, acute accent - {"Icirc", new Integer(206)}, // Î - uppercase I, circumflex accent - {"Iuml", new Integer(207)}, // Ï - uppercase I, umlaut - {"ETH", new Integer(208)}, // Ð - uppercase Eth, Icelandic - {"Ntilde", new Integer(209)}, // Ñ - uppercase N, tilde - {"Ograve", new Integer(210)}, // Ò - uppercase O, grave accent - {"Oacute", new 
Integer(211)}, // Ó - uppercase O, acute accent - {"Ocirc", new Integer(212)}, // Ô - uppercase O, circumflex accent - {"Otilde", new Integer(213)}, // Õ - uppercase O, tilde - {"Ouml", new Integer(214)}, // Ö - uppercase O, umlaut - {"times", new Integer(215)}, //multiplication sign - {"Oslash", new Integer(216)}, // Ø - uppercase O, slash - {"Ugrave", new Integer(217)}, // Ù - uppercase U, grave accent - {"Uacute", new Integer(218)}, // Ú - uppercase U, acute accent - {"Ucirc", new Integer(219)}, // Û - uppercase U, circumflex accent - {"Uuml", new Integer(220)}, // Ü - uppercase U, umlaut - {"Yacute", new Integer(221)}, // Ý - uppercase Y, acute accent - {"THORN", new Integer(222)}, // Þ - uppercase THORN, Icelandic - {"szlig", new Integer(223)}, // ß - lowercase sharps, German - {"agrave", new Integer(224)}, // à - lowercase a, grave accent - {"aacute", new Integer(225)}, // á - lowercase a, acute accent - {"acirc", new Integer(226)}, // â - lowercase a, circumflex accent - {"atilde", new Integer(227)}, // ã - lowercase a, tilde - {"auml", new Integer(228)}, // ä - lowercase a, umlaut - {"aring", new Integer(229)}, // å - lowercase a, ring - {"aelig", new Integer(230)}, // æ - lowercase ae - {"ccedil", new Integer(231)}, // ç - lowercase c, cedilla - {"egrave", new Integer(232)}, // è - lowercase e, grave accent - {"eacute", new Integer(233)}, // é - lowercase e, acute accent - {"ecirc", new Integer(234)}, // ê - lowercase e, circumflex accent - {"euml", new Integer(235)}, // ë - lowercase e, umlaut - {"igrave", new Integer(236)}, // ì - lowercase i, grave accent - {"iacute", new Integer(237)}, // í - lowercase i, acute accent - {"icirc", new Integer(238)}, // î - lowercase i, circumflex accent - {"iuml", new Integer(239)}, // ï - lowercase i, umlaut - {"eth", new Integer(240)}, // ð - lowercase eth, Icelandic - {"ntilde", new Integer(241)}, // ñ - lowercase n, tilde - {"ograve", new Integer(242)}, // ò - lowercase o, grave accent - {"oacute", new 
Integer(243)}, // ó - lowercase o, acute accent - {"ocirc", new Integer(244)}, // ô - lowercase o, circumflex accent - {"otilde", new Integer(245)}, // õ - lowercase o, tilde - {"ouml", new Integer(246)}, // ö - lowercase o, umlaut - {"divide", new Integer(247)}, // division sign - {"oslash", new Integer(248)}, // ø - lowercase o, slash - {"ugrave", new Integer(249)}, // ù - lowercase u, grave accent - {"uacute", new Integer(250)}, // ú - lowercase u, acute accent - {"ucirc", new Integer(251)}, // û - lowercase u, circumflex accent - {"uuml", new Integer(252)}, // ü - lowercase u, umlaut - {"yacute", new Integer(253)}, // ý - lowercase y, acute accent - {"thorn", new Integer(254)}, // þ - lowercase thorn, Icelandic - {"yuml", new Integer(255)}, // ÿ - lowercase y, umlaut - {"euro", new Integer(8364)}, // Euro symbol - }; - - private Map mapNameToValue; - private Map mapValueToName; - - public Entities() { - mapNameToValue = new HashMap(); - mapValueToName = new HashMap(); - for (int i = 0; i < entities.length; ++i) { - mapNameToValue.put(entities[i][0], entities[i][1]); - mapValueToName.put(entities[i][1], entities[i][0]); - } - } - - public String entityName(int value) { - return (String) mapValueToName.get(new Integer(value)); - } - - public Integer entityValue(String name) { - return (Integer) mapNameToValue.get(name); - } - } - - private static Entities entities; - - private static void initEntities() { - if (entities == null) - entities = new Entities(); - } - - private static String entityName(char ch) { - initEntities(); - return entities.entityName(ch); - } - - private static Integer entityValue(String entity) { - initEntities(); - return entities.entityValue(entity); - } - /** *

Escapes the characters in a string using HTML entities.

+ *

* For example: "bread" & "butter" => &quot;bread&quot; &amp; &quot;butter&quot;. - * Supports all known HTML entities, including funky accents. See the source code for more detail. - * see http://hotwired.lycos.com/webmonkey/reference/special_characters/ + *

+ *

Supports the named entities of HTML 4.0, including accented letters. + * See the source code for more detail. + *

+ * @see http://hotwired.lycos.com/webmonkey/reference/special_characters/ + * @see Entities * @see #unescapeHtml(String) **/ public static String escapeHtml(String str) { + return escapeEntities(str, Entities.HTML40); + } + + /** + * Given a string containing entity escapes, returns a string + * containing the actual Unicode characters corresponding to the + * escapes. + * + * @see #escapeHtml(String) + **/ + public static String unescapeHtml(String str) { + return unescapeEntities(str, Entities.HTML40); + } + + /** + *

Escapes the characters in a string using XML entities.

+ *

+ * For example: "bread" & "butter" => &quot;bread&quot; &amp; &quot;butter&quot;. + *

+ *

+ * Supports only the four basic XML entities (gt, lt, quot, amp). + * Does not support DTDs or external entities. + *

+ * @see #unescapeXml(java.lang.String) + **/ + public static String escapeXml(String str) { + return escapeEntities(str, Entities.XML); + } + + /** + *

Given a string containing XML entity escapes, returns a string + * containing the actual Unicode characters corresponding to the + * escapes. + *

+ *

+ * Supports only the four basic XML entities (gt, lt, quot, amp). + * Does not support DTDs or external entities. + *

+ * + * @see #escapeXml(String) + **/ + public static String unescapeXml(String str) { + return unescapeEntities(str, Entities.XML); + } + + private static String escapeEntities(String str, Entities entities) { StringBuffer buf = new StringBuffer(str.length() * 2); int i; for (i = 0; i < str.length(); ++i) { char ch = str.charAt(i); - String entity = entityName(ch); + String entity = entities.entityName(ch); if (entity == null) { - if (((int) ch) > 128) { // should this be 127 or 128? + if (((int) ch) > 0x7F) { int intValue = ((int) ch); buf.append("&#" + intValue + ";"); } else { @@ -524,14 +434,7 @@ public class StringEscapeUtils { return buf.toString(); } - /** - * Given a string containing entity escapes, returns a string - * containing the actual Unicode characters corresponding to the - * escapes. - * - * @see #escapeHtml(String) - **/ - public static String unescapeHtml(String str) { + private static String unescapeEntities(String str, Entities entities) { StringBuffer buf = new StringBuffer(str.length()); int i; for (i = 0; i < str.length(); ++i) { @@ -547,7 +450,7 @@ public class StringEscapeUtils { if (entity.charAt(0) == '#') { iso = new Integer(entity.substring(1)); } else { - iso = entityValue(entity); + iso = entities.entityValue(entity); } if (iso == null) { buf.append("&" + entity + ";"); diff --git a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java index 4e8046669..0bf98b7ae 100644 --- a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java +++ b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java @@ -66,7 +66,7 @@ import junit.textui.TestRunner; * * @author of original StringUtilsTest.testEscape = ? 
* @author Alexander Day Chaffee - * @version $Id: StringEscapeUtilsTest.java,v 1.3 2003/04/09 18:45:29 alex Exp $ + * @version $Id: StringEscapeUtilsTest.java,v 1.4 2003/04/09 20:55:33 alex Exp $ */ public class StringEscapeUtilsTest extends TestCase { private final static String FOO = "foo"; @@ -143,8 +143,8 @@ public class StringEscapeUtilsTest extends TestCase { assertEquals("unescape(String) failed" + (message == null ? "" : (": " + message)) + - // we escape this so we can see it in the error message ": expected '" + StringUtils.escape(expected) + + // we escape this so we can see it in the error message "' actual '" + StringUtils.escape(actual) + "'", expected, actual); @@ -159,8 +159,16 @@ public class StringEscapeUtilsTest extends TestCase { } - // HTML + // HTML and XML //-------------------------------------------------------------- + + public void testEntitiesObject() throws Exception + { + assertEquals("gt", Entities.XML.entityName('>')); + assertEquals(new Integer('>'), Entities.XML.entityValue("gt")); + assertEquals(null, Entities.XML.entityValue("xyzzy")); + } + String[][] htmlEscapes = { {"no escaping", "plain text", "plain text"}, {"no escaping", "plain text", "plain text"}, @@ -171,6 +179,8 @@ public class StringEscapeUtilsTest extends TestCase { {"first character only", "< less than", "< less than"}, {"apostrophe", "Huntington's chorea", "Huntington's chorea"}, {"languages", "English,Français,日本語 (nihongo)", "English,Fran\u00E7ais,\u65E5\u672C\u8A9E (nihongo)"}, + {"8-bit ascii doesn't number-escape", "~\u007F", "\u007E\u007F"}, + {"8-bit ascii does number-escape", "€Ÿ", "\u0080\u009F"}, }; public void testEscapeHtml() { @@ -180,7 +190,7 @@ public class StringEscapeUtilsTest extends TestCase { } } - public void testHtmlunescape() { + public void testUnescapeHtml() { for (int i = 0; i < htmlEscapes.length; ++i) { assertEquals(htmlEscapes[i][0], htmlEscapes[i][2], StringEscapeUtils.unescapeHtml(htmlEscapes[i][1])); // todo: add test for (and 
implement) Writer-based version @@ -191,6 +201,25 @@ public class StringEscapeUtilsTest extends TestCase { assertEquals("funny chars pass through OK", "Fran\u00E7ais", StringEscapeUtils.unescapeHtml("Fran\u00E7ais")); } + public void testEscapeHtmlVersions() throws Exception + { + assertEquals("Β", StringEscapeUtils.escapeHtml("\u0392")); + assertEquals("\u0392", StringEscapeUtils.unescapeHtml("Β")); + + //todo: refine API for escaping/unescaping specific HTML versions + + } + + public void testEscapeXml() throws Exception { + assertEquals("<abc>", StringEscapeUtils.escapeXml("")); + assertEquals("", StringEscapeUtils.unescapeXml("<abc>")); + + assertEquals("XML should use numbers, not names for HTML entities", + "¡", StringEscapeUtils.escapeXml("\u00A1")); + assertEquals("XML should use numbers, not names for HTML entities", + "\u00A0", StringEscapeUtils.unescapeXml(" ")); + } + // SQL // see http://www.jguru.com/faq/view.jsp?EID=8881 //--------------------