Extracted Entities class (instead of inner class)

Added support for full HTML 4.0 entity set. Separated XML, ISO8859-1 (HTML 3.2), and HTML 4.0 entities inside Entities object. Added escapeXml and unescapeXml public API methods. Escape now uses entities for values 0x80 or greater (previously 0x100). git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137292 13f79535-47bb-0310-9956-ffa450edef68
2003-04-09 20:55:33 +00:00 · 2003-04-09 20:55:33 +00:00 · 69cc0e40e6
parent 6af3b80369
commit 69cc0e40e6
3 changed files with 513 additions and 164 deletions
--- a/src/java/org/apache/commons/lang/Entities.java
+++ b/src/java/org/apache/commons/lang/Entities.java
@ -0,0 +1,417 @@
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2002-2003 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if
+ *    any, must include the following acknowlegement:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowlegement may appear in the software itself,
+ *    if and wherever such third-party acknowlegements normally appear.
+ *
+ * 4. The names "The Jakarta Project", "Commons", and "Apache Software
+ *    Foundation" must not be used to endorse or promote products derived
+ *    from this software without prior written permission. For written
+ *    permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache"
+ *    nor may "Apache" appear in their names without prior written
+ *    permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.lang;
+
+import java.util.Map;
+import java.util.HashMap;
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.lang.exception.NestableRuntimeException;
+
+//todo: unit test and make public
+
+/**
+ * <p>HTML and XML entity utility class.</p>
+ *
+ * <p>Each instance is a two-way lookup table between entity names
+ * (for example <code>amp</code>) and their numeric character values
+ * (for example <code>38</code>). Three predefined tables are provided:
+ * {@link #XML}, {@link #HTML32} and {@link #HTML40}.</p>
+ *
+ * <p>See
+ * <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">WebMonkey</a>
+ * <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2</a>
+ * <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0</a>
+ * </p>
+ *
+ * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
+ * @since 2.0
+ * @version $Id: Entities.java,v 1.1 2003/04/09 20:55:32 alex Exp $
+ */
+class Entities {
+
+    /** The four entities shared by XML and HTML: {name, decimal character value}. */
+    private static final Object[][] basic = {
+        {"quot", "34"}, // " - double-quote
+        {"amp", "38"}, // & - ampersand
+        {"lt", "60"}, // < - less-than
+        {"gt", "62"}, // > - greater-than
+    };
+
+    /** Entities for character values 160 to 255: {name, decimal character value}. */
+    private static final Object[][] iso8859_1 = {
+        {"nbsp", "160"}, // non-breaking space
+        {"iexcl", "161"}, //inverted exclamation mark
+        {"cent", "162"}, //cent sign
+        {"pound", "163"}, //pound sign
+        {"curren", "164"}, //currency sign
+        {"yen", "165"}, //yen sign = yuan sign
+        {"brvbar", "166"}, //broken bar = broken vertical bar
+        {"sect", "167"}, //section sign
+        {"uml", "168"}, //diaeresis = spacing diaeresis
+        {"copy", "169"}, // © - copyright sign
+        {"ordf", "170"}, //feminine ordinal indicator
+        {"laquo", "171"}, //left-pointing double angle quotation mark = left pointing guillemet
+        {"not", "172"}, //not sign
+        {"shy", "173"}, //soft hyphen = discretionary hyphen
+        {"reg", "174"}, // ® - registered trademark sign
+        {"macr", "175"}, //macron = spacing macron = overline = APL overbar
+        {"deg", "176"}, //degree sign
+        {"plusmn", "177"}, //plus-minus sign = plus-or-minus sign
+        {"sup2", "178"}, //superscript two = superscript digit two = squared
+        {"sup3", "179"}, //superscript three = superscript digit three = cubed
+        {"acute", "180"}, //acute accent = spacing acute
+        {"micro", "181"}, //micro sign
+        {"para", "182"}, //pilcrow sign = paragraph sign
+        {"middot", "183"}, //middle dot = Georgian comma = Greek middle dot
+        {"cedil", "184"}, //cedilla = spacing cedilla
+        {"sup1", "185"}, //superscript one = superscript digit one
+        {"ordm", "186"}, //masculine ordinal indicator
+        {"raquo", "187"}, //right-pointing double angle quotation mark = right pointing guillemet
+        {"frac14", "188"}, //vulgar fraction one quarter = fraction one quarter
+        {"frac12", "189"}, //vulgar fraction one half = fraction one half
+        {"frac34", "190"}, //vulgar fraction three quarters = fraction three quarters
+        {"iquest", "191"}, //inverted question mark = turned question mark
+        {"Agrave", "192"}, // À - uppercase A, grave accent
+        {"Aacute", "193"}, // Á - uppercase A, acute accent
+        {"Acirc", "194"}, // Â - uppercase A, circumflex accent
+        {"Atilde", "195"}, // Ã - uppercase A, tilde
+        {"Auml", "196"}, // Ä - uppercase A, umlaut
+        {"Aring", "197"}, // Å - uppercase A, ring
+        {"AElig", "198"}, // Æ - uppercase AE
+        {"Ccedil", "199"}, // Ç - uppercase C, cedilla
+        {"Egrave", "200"}, // È - uppercase E, grave accent
+        {"Eacute", "201"}, // É - uppercase E, acute accent
+        {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
+        {"Euml", "203"}, // Ë - uppercase E, umlaut
+        {"Igrave", "204"}, // Ì - uppercase I, grave accent
+        {"Iacute", "205"}, // Í - uppercase I, acute accent
+        {"Icirc", "206"}, // Î - uppercase I, circumflex accent
+        {"Iuml", "207"}, // Ï - uppercase I, umlaut
+        {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
+        {"Ntilde", "209"}, // Ñ - uppercase N, tilde
+        {"Ograve", "210"}, // Ò - uppercase O, grave accent
+        {"Oacute", "211"}, // Ó - uppercase O, acute accent
+        {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
+        {"Otilde", "213"}, // Õ - uppercase O, tilde
+        {"Ouml", "214"}, // Ö - uppercase O, umlaut
+        {"times", "215"}, //multiplication sign
+        {"Oslash", "216"}, // Ø - uppercase O, slash
+        {"Ugrave", "217"}, // Ù - uppercase U, grave accent
+        {"Uacute", "218"}, // Ú - uppercase U, acute accent
+        {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
+        {"Uuml", "220"}, // Ü - uppercase U, umlaut
+        {"Yacute", "221"}, // Ý - uppercase Y, acute accent
+        {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
+        {"szlig", "223"}, // ß - lowercase sharps, German
+        {"agrave", "224"}, // à - lowercase a, grave accent
+        {"aacute", "225"}, // á - lowercase a, acute accent
+        {"acirc", "226"}, // â - lowercase a, circumflex accent
+        {"atilde", "227"}, // ã - lowercase a, tilde
+        {"auml", "228"}, // ä - lowercase a, umlaut
+        {"aring", "229"}, // å - lowercase a, ring
+        {"aelig", "230"}, // æ - lowercase ae
+        {"ccedil", "231"}, // ç - lowercase c, cedilla
+        {"egrave", "232"}, // è - lowercase e, grave accent
+        {"eacute", "233"}, // é - lowercase e, acute accent
+        {"ecirc", "234"}, // ê - lowercase e, circumflex accent
+        {"euml", "235"}, // ë - lowercase e, umlaut
+        {"igrave", "236"}, // ì - lowercase i, grave accent
+        {"iacute", "237"}, // í - lowercase i, acute accent
+        {"icirc", "238"}, // î - lowercase i, circumflex accent
+        {"iuml", "239"}, // ï - lowercase i, umlaut
+        {"eth", "240"}, // ð - lowercase eth, Icelandic
+        {"ntilde", "241"}, // ñ - lowercase n, tilde
+        {"ograve", "242"}, // ò - lowercase o, grave accent
+        {"oacute", "243"}, // ó - lowercase o, acute accent
+        {"ocirc", "244"}, // ô - lowercase o, circumflex accent
+        {"otilde", "245"}, // õ - lowercase o, tilde
+        {"ouml", "246"}, // ö - lowercase o, umlaut
+        {"divide", "247"}, // division sign
+        {"oslash", "248"}, // ø - lowercase o, slash
+        {"ugrave", "249"}, // ù - lowercase u, grave accent
+        {"uacute", "250"}, // ú - lowercase u, acute accent
+        {"ucirc", "251"}, // û - lowercase u, circumflex accent
+        {"uuml", "252"}, // ü - lowercase u, umlaut
+        {"yacute", "253"}, // ý - lowercase y, acute accent
+        {"thorn", "254"}, // þ - lowercase thorn, Icelandic
+        {"yuml", "255"}, // ÿ - lowercase y, umlaut
+    };
+
+    /**
+     * Entities added on top of {@link #iso8859_1} for {@link #HTML40}:
+     * {name, decimal character value}. The trailing comments are carried
+     * over from the entity declarations at
+     * http://www.w3.org/TR/REC-html40/sgml/entities.html
+     */
+    static final Object[][] html40 = {
+// <!-- Latin Extended-B -->
+        {"fnof", "402"},  //latin small f with hook = function= florin, U+0192 ISOtech -->
+// <!-- Greek -->
+        {"Alpha", "913"},  //greek capital letter alpha, U+0391 -->
+        {"Beta", "914"},  //greek capital letter beta, U+0392 -->
+        {"Gamma", "915"},  //greek capital letter gamma,U+0393 ISOgrk3 -->
+        {"Delta", "916"},  //greek capital letter delta,U+0394 ISOgrk3 -->
+        {"Epsilon", "917"},  //greek capital letter epsilon, U+0395 -->
+        {"Zeta", "918"},  //greek capital letter zeta, U+0396 -->
+        {"Eta", "919"},  //greek capital letter eta, U+0397 -->
+        {"Theta", "920"},  //greek capital letter theta,U+0398 ISOgrk3 -->
+        {"Iota", "921"},  //greek capital letter iota, U+0399 -->
+        {"Kappa", "922"},  //greek capital letter kappa, U+039A -->
+        {"Lambda", "923"},  //greek capital letter lambda,U+039B ISOgrk3 -->
+        {"Mu", "924"},  //greek capital letter mu, U+039C -->
+        {"Nu", "925"},  //greek capital letter nu, U+039D -->
+        {"Xi", "926"},  //greek capital letter xi, U+039E ISOgrk3 -->
+        {"Omicron", "927"},  //greek capital letter omicron, U+039F -->
+        {"Pi", "928"},  //greek capital letter pi, U+03A0 ISOgrk3 -->
+        {"Rho", "929"},  //greek capital letter rho, U+03A1 -->
+// <!-- there is no Sigmaf, and no U+03A2 character either -->
+        {"Sigma", "931"},  //greek capital letter sigma,U+03A3 ISOgrk3 -->
+        {"Tau", "932"},  //greek capital letter tau, U+03A4 -->
+        {"Upsilon", "933"},  //greek capital letter upsilon,U+03A5 ISOgrk3 -->
+        {"Phi", "934"},  //greek capital letter phi,U+03A6 ISOgrk3 -->
+        {"Chi", "935"},  //greek capital letter chi, U+03A7 -->
+        {"Psi", "936"},  //greek capital letter psi,U+03A8 ISOgrk3 -->
+        {"Omega", "937"},  //greek capital letter omega,U+03A9 ISOgrk3 -->
+        {"alpha", "945"},  //greek small letter alpha,U+03B1 ISOgrk3 -->
+        {"beta", "946"},  //greek small letter beta, U+03B2 ISOgrk3 -->
+        {"gamma", "947"},  //greek small letter gamma,U+03B3 ISOgrk3 -->
+        {"delta", "948"},  //greek small letter delta,U+03B4 ISOgrk3 -->
+        {"epsilon", "949"},  //greek small letter epsilon,U+03B5 ISOgrk3 -->
+        {"zeta", "950"},  //greek small letter zeta, U+03B6 ISOgrk3 -->
+        {"eta", "951"},  //greek small letter eta, U+03B7 ISOgrk3 -->
+        {"theta", "952"},  //greek small letter theta,U+03B8 ISOgrk3 -->
+        {"iota", "953"},  //greek small letter iota, U+03B9 ISOgrk3 -->
+        {"kappa", "954"},  //greek small letter kappa,U+03BA ISOgrk3 -->
+        {"lambda", "955"},  //greek small letter lambda,U+03BB ISOgrk3 -->
+        {"mu", "956"},  //greek small letter mu, U+03BC ISOgrk3 -->
+        {"nu", "957"},  //greek small letter nu, U+03BD ISOgrk3 -->
+        {"xi", "958"},  //greek small letter xi, U+03BE ISOgrk3 -->
+        {"omicron", "959"},  //greek small letter omicron, U+03BF NEW -->
+        {"pi", "960"},  //greek small letter pi, U+03C0 ISOgrk3 -->
+        {"rho", "961"},  //greek small letter rho, U+03C1 ISOgrk3 -->
+        {"sigmaf", "962"},  //greek small letter final sigma,U+03C2 ISOgrk3 -->
+        {"sigma", "963"},  //greek small letter sigma,U+03C3 ISOgrk3 -->
+        {"tau", "964"},  //greek small letter tau, U+03C4 ISOgrk3 -->
+        {"upsilon", "965"},  //greek small letter upsilon,U+03C5 ISOgrk3 -->
+        {"phi", "966"},  //greek small letter phi, U+03C6 ISOgrk3 -->
+        {"chi", "967"},  //greek small letter chi, U+03C7 ISOgrk3 -->
+        {"psi", "968"},  //greek small letter psi, U+03C8 ISOgrk3 -->
+        {"omega", "969"},  //greek small letter omega,U+03C9 ISOgrk3 -->
+        {"thetasym", "977"},  //greek small letter theta symbol,U+03D1 NEW -->
+        {"upsih", "978"},  //greek upsilon with hook symbol,U+03D2 NEW -->
+        {"piv", "982"},  //greek pi symbol, U+03D6 ISOgrk3 -->
+// <!-- General Punctuation -->
+        {"bull", "8226"},  //bullet = black small circle,U+2022 ISOpub  -->
+// <!-- bullet is NOT the same as bullet operator, U+2219 -->
+        {"hellip", "8230"},  //horizontal ellipsis = three dot leader,U+2026 ISOpub  -->
+        {"prime", "8242"},  //prime = minutes = feet, U+2032 ISOtech -->
+        {"Prime", "8243"},  //double prime = seconds = inches,U+2033 ISOtech -->
+        {"oline", "8254"},  //overline = spacing overscore,U+203E NEW -->
+        {"frasl", "8260"},  //fraction slash, U+2044 NEW -->
+// <!-- Letterlike Symbols -->
+        {"weierp", "8472"},  //script capital P = power set= Weierstrass p, U+2118 ISOamso -->
+        {"image", "8465"},  //blackletter capital I = imaginary part,U+2111 ISOamso -->
+        {"real", "8476"},  //blackletter capital R = real part symbol,U+211C ISOamso -->
+        {"trade", "8482"},  //trade mark sign, U+2122 ISOnum -->
+        {"alefsym", "8501"},  //alef symbol = first transfinite cardinal,U+2135 NEW -->
+// <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the same glyph could be used to depict both characters -->
+// <!-- Arrows -->
+        {"larr", "8592"},  //leftwards arrow, U+2190 ISOnum -->
+        {"uarr", "8593"},  //upwards arrow, U+2191 ISOnum-->
+        {"rarr", "8594"},  //rightwards arrow, U+2192 ISOnum -->
+        {"darr", "8595"},  //downwards arrow, U+2193 ISOnum -->
+        {"harr", "8596"},  //left right arrow, U+2194 ISOamsa -->
+        {"crarr", "8629"},  //downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
+        {"lArr", "8656"},  //leftwards double arrow, U+21D0 ISOtech -->
+// <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' arrow but also does not have any other character for that function. So ? lArr can be used for 'is implied by' as ISOtech suggests -->
+        {"uArr", "8657"},  //upwards double arrow, U+21D1 ISOamsa -->
+        {"rArr", "8658"},  //rightwards double arrow,U+21D2 ISOtech -->
+// <!-- ISO 10646 does not say this is the 'implies' character but does not have another character with this function so ?rArr can be used for 'implies' as ISOtech suggests -->
+        {"dArr", "8659"},  //downwards double arrow, U+21D3 ISOamsa -->
+        {"hArr", "8660"},  //left right double arrow,U+21D4 ISOamsa -->
+// <!-- Mathematical Operators -->
+        {"forall", "8704"},  //for all, U+2200 ISOtech -->
+        {"part", "8706"},  //partial differential, U+2202 ISOtech  -->
+        {"exist", "8707"},  //there exists, U+2203 ISOtech -->
+        {"empty", "8709"},  //empty set = null set = diameter,U+2205 ISOamso -->
+        {"nabla", "8711"},  //nabla = backward difference,U+2207 ISOtech -->
+        {"isin", "8712"},  //element of, U+2208 ISOtech -->
+        {"notin", "8713"},  //not an element of, U+2209 ISOtech -->
+        {"ni", "8715"},  //contains as member, U+220B ISOtech -->
+// <!-- should there be a more memorable name than 'ni'? -->
+        {"prod", "8719"},  //n-ary product = product sign,U+220F ISOamsb -->
+// <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both -->
+        {"sum", "8721"},  //n-ary summation, U+2211 ISOamsb -->
+// <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both -->
+        {"minus", "8722"},  //minus sign, U+2212 ISOtech -->
+        {"lowast", "8727"},  //asterisk operator, U+2217 ISOtech -->
+        {"radic", "8730"},  //square root = radical sign,U+221A ISOtech -->
+        {"prop", "8733"},  //proportional to, U+221D ISOtech -->
+        {"infin", "8734"},  //infinity, U+221E ISOtech -->
+        {"ang", "8736"},  //angle, U+2220 ISOamso -->
+        {"and", "8743"},  //logical and = wedge, U+2227 ISOtech -->
+        {"or", "8744"},  //logical or = vee, U+2228 ISOtech -->
+        {"cap", "8745"},  //intersection = cap, U+2229 ISOtech -->
+        {"cup", "8746"},  //union = cup, U+222A ISOtech -->
+        {"int", "8747"},  //integral, U+222B ISOtech -->
+        {"there4", "8756"},  //therefore, U+2234 ISOtech -->
+        {"sim", "8764"},  //tilde operator = varies with = similar to,U+223C ISOtech -->
+// <!-- tilde operator is NOT the same character as the tilde, U+007E,although the same glyph might be used to represent both  -->
+        {"cong", "8773"},  //approximately equal to, U+2245 ISOtech -->
+        {"asymp", "8776"},  //almost equal to = asymptotic to,U+2248 ISOamsr -->
+        {"ne", "8800"},  //not equal to, U+2260 ISOtech -->
+        {"equiv", "8801"},  //identical to, U+2261 ISOtech -->
+        {"le", "8804"},  //less-than or equal to, U+2264 ISOtech -->
+        {"ge", "8805"},  //greater-than or equal to,U+2265 ISOtech -->
+        {"sub", "8834"},  //subset of, U+2282 ISOtech -->
+        {"sup", "8835"},  //superset of, U+2283 ISOtech -->
+// <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn  -->
+        // This entry used to be fused into the comment above, so "nsub" was never registered.
+        {"nsub", "8836"},  //not a subset of, U+2284 ISOamsn -->
+        {"sube", "8838"},  //subset of or equal to, U+2286 ISOtech -->
+        {"supe", "8839"},  //superset of or equal to,U+2287 ISOtech -->
+        {"oplus", "8853"},  //circled plus = direct sum,U+2295 ISOamsb -->
+        {"otimes", "8855"},  //circled times = vector product,U+2297 ISOamsb -->
+        {"perp", "8869"},  //up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
+        {"sdot", "8901"},  //dot operator, U+22C5 ISOamsb -->
+// <!-- dot operator is NOT the same character as U+00B7 middle dot -->
+// <!-- Miscellaneous Technical -->
+        {"lceil", "8968"},  //left ceiling = apl upstile,U+2308 ISOamsc  -->
+        {"rceil", "8969"},  //right ceiling, U+2309 ISOamsc  -->
+        {"lfloor", "8970"},  //left floor = apl downstile,U+230A ISOamsc  -->
+        {"rfloor", "8971"},  //right floor, U+230B ISOamsc  -->
+        {"lang", "9001"},  //left-pointing angle bracket = bra,U+2329 ISOtech -->
+// <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark' -->
+        {"rang", "9002"},  //right-pointing angle bracket = ket,U+232A ISOtech -->
+// <!-- rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark' -->
+// <!-- Geometric Shapes -->
+        {"loz", "9674"},  //lozenge, U+25CA ISOpub -->
+// <!-- Miscellaneous Symbols -->
+        {"spades", "9824"},  //black spade suit, U+2660 ISOpub -->
+// <!-- black here seems to mean filled as opposed to hollow -->
+        {"clubs", "9827"},  //black club suit = shamrock,U+2663 ISOpub -->
+        {"hearts", "9829"},  //black heart suit = valentine,U+2665 ISOpub -->
+        {"diams", "9830"},  //black diamond suit, U+2666 ISOpub -->
+
+// <!-- Latin Extended-A -->
+        {"OElig", "338"},  //  -- latin capital ligature OE,U+0152 ISOlat2 -->
+        {"oelig", "339"},  //  -- latin small ligature oe, U+0153 ISOlat2 -->
+// <!-- ligature is a misnomer, this is a separate character in some languages -->
+        {"Scaron", "352"},  //  -- latin capital letter S with caron,U+0160 ISOlat2 -->
+        {"scaron", "353"},  //  -- latin small letter s with caron,U+0161 ISOlat2 -->
+        {"Yuml", "376"},  //  -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
+// <!-- Spacing Modifier Letters -->
+        {"circ", "710"},  //  -- modifier letter circumflex accent,U+02C6 ISOpub -->
+        {"tilde", "732"},  //small tilde, U+02DC ISOdia -->
+// <!-- General Punctuation -->
+        {"ensp", "8194"},  //en space, U+2002 ISOpub -->
+        {"emsp", "8195"},  //em space, U+2003 ISOpub -->
+        {"thinsp", "8201"},  //thin space, U+2009 ISOpub -->
+        {"zwnj", "8204"},  //zero width non-joiner,U+200C NEW RFC 2070 -->
+        {"zwj", "8205"},  //zero width joiner, U+200D NEW RFC 2070 -->
+        {"lrm", "8206"},  //left-to-right mark, U+200E NEW RFC 2070 -->
+        {"rlm", "8207"},  //right-to-left mark, U+200F NEW RFC 2070 -->
+        {"ndash", "8211"},  //en dash, U+2013 ISOpub -->
+        {"mdash", "8212"},  //em dash, U+2014 ISOpub -->
+        {"lsquo", "8216"},  //left single quotation mark,U+2018 ISOnum -->
+        {"rsquo", "8217"},  //right single quotation mark,U+2019 ISOnum -->
+        {"sbquo", "8218"},  //single low-9 quotation mark, U+201A NEW -->
+        {"ldquo", "8220"},  //left double quotation mark,U+201C ISOnum -->
+        {"rdquo", "8221"},  //right double quotation mark,U+201D ISOnum -->
+        {"bdquo", "8222"},  //double low-9 quotation mark, U+201E NEW -->
+        {"dagger", "8224"},  //dagger, U+2020 ISOpub -->
+        {"Dagger", "8225"},  //double dagger, U+2021 ISOpub -->
+        {"permil", "8240"},  //per mille sign, U+2030 ISOtech -->
+        {"lsaquo", "8249"},  //single left-pointing angle quotation mark,U+2039 ISO proposed -->
+// <!-- lsaquo is proposed but not yet ISO standardized -->
+        {"rsaquo", "8250"},  //single right-pointing angle quotation mark,U+203A ISO proposed -->
+// <!-- rsaquo is proposed but not yet ISO standardized -->
+        {"euro", "8364"},   //  -- euro sign, U+20AC NEW -->
+    };
+
+    /** Entity set containing only the {@link #basic} entities. */
+    public static final Entities XML;
+    /** Entity set containing the {@link #basic} and {@link #iso8859_1} entities. */
+    public static final Entities HTML32;
+    /** Entity set containing the {@link #basic}, {@link #iso8859_1} and {@link #html40} entities. */
+    public static final Entities HTML40;
+
+    // The tables above are declared first, so they are already initialized here.
+    static {
+        XML = new Entities();
+        XML.addEntities(basic);
+
+        HTML32 = new Entities();
+        HTML32.addEntities(basic);
+        HTML32.addEntities(iso8859_1);
+
+        HTML40 = new Entities();
+        HTML40.addEntities(basic);
+        HTML40.addEntities(iso8859_1);
+        HTML40.addEntities(html40);
+    }
+
+    //todo: refactor into a bi-di map object (or look for one and use it)
+    /** Entity name (String) to character value (Integer). */
+    private final Map mapNameToValue;
+    /** Character value (Integer) to entity name (String). */
+    private final Map mapValueToName;
+
+    /**
+     * Creates an empty entity set; only the predefined sets are built.
+     */
+    private Entities() {
+        mapNameToValue = new HashMap();
+        mapValueToName = new HashMap();
+    }
+
+    /**
+     * Registers every {name, value} pair of the given table in both lookup
+     * directions. A later pair replaces an earlier one with the same name
+     * or the same value.
+     *
+     * @param entityArray rows of two Strings: the entity name and its
+     *  character value in decimal notation
+     * @throws NumberFormatException if a value is not a decimal integer
+     */
+    private void addEntities(Object[][] entityArray) {
+        //todo: analyze whether it's more efficient to use strings or integers as the value
+        for (int i = 0; i < entityArray.length; ++i) {
+            Object name = entityArray[i][0];
+            // Parse the decimal value once and share the Integer between both maps.
+            Integer value = Integer.valueOf((String) entityArray[i][1]);
+            mapNameToValue.put(name, value);
+            mapValueToName.put(value, name);
+        }
+    }
+
+    /**
+     * Looks up the entity name registered for a character value.
+     *
+     * @param value the character value, for example <code>38</code>
+     * @return the entity name without <code>&amp;</code> and <code>;</code>,
+     *  or <code>null</code> if this set has no entity for the value
+     */
+    public String entityName(int value) {
+        return (String) mapValueToName.get(new Integer(value));
+    }
+
+    /**
+     * Looks up the character value registered for an entity name.
+     *
+     * @param name the entity name without <code>&amp;</code> and
+     *  <code>;</code>, for example <code>amp</code>; case-sensitive
+     * @return the character value, or <code>null</code> if this set has no
+     *  entity with that name
+     */
+    public Integer entityValue(String name) {
+        return (Integer) mapNameToValue.get(name);
+    }
+}
--- a/src/java/org/apache/commons/lang/StringEscapeUtils.java
+++ b/src/java/org/apache/commons/lang/StringEscapeUtils.java
@ -75,10 +75,15 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
 * @author Helge Tesgaard
 * @author <a href="sean@boohai.com">Sean Brown</a>
 * @since 2.0
- * @version $Id: StringEscapeUtils.java,v 1.5 2003/04/09 18:45:28 alex Exp $
+ * @version $Id: StringEscapeUtils.java,v 1.6 2003/04/09 20:55:32 alex Exp $
 */
 public class StringEscapeUtils {

+    /**
+     * The entity set to use when escaping and unescaping HTML
+     */
+    protected static Entities DEFAULT_ENTITIES = Entities.HTML40;
+
    /**
     * <p><code>StringEscapeUtils</code> instances should NOT be constructed in
     * standard programming. Instead, the class should be used as
@ -351,167 +356,72 @@ public class StringEscapeUtils {
    // HTML and XML
    //--------------------------------------------------------------------------

-    private static class Entities {
-        // see http://hotwired.lycos.com/webmonkey/reference/special_characters/
-        // see http://www.w3.org/TR/REC-html40/sgml/entities.html
-        static Object[][] entities = {
-            // {"#39", new Integer(39)},       // ' - apostrophe
-            {"quot", new Integer(34)}, // " - double-quote
-            {"amp", new Integer(38)}, // & - ampersand
-            {"lt", new Integer(60)}, // < - less-than
-            {"gt", new Integer(62)}, // > - greater-than
-            {"nbsp", new Integer(160)}, // non-breaking space
-            {"iexcl", new Integer(161)}, //inverted exclamation mark
-            {"cent", new Integer(162)}, //cent sign
-            {"pound", new Integer(163)}, //pound sign
-            {"curren", new Integer(164)}, //currency sign
-            {"yen", new Integer(165)}, //yen sign = yuan sign
-            {"brvbar", new Integer(166)}, //broken bar = broken vertical bar
-            {"sect", new Integer(167)}, //section sign
-            {"uml", new Integer(168)}, //diaeresis = spacing diaeresis
-            {"copy", new Integer(169)}, // © - copyright sign
-            {"ordf", new Integer(170)}, //feminine ordinal indicator
-            {"laquo", new Integer(171)}, //left-pointing double angle quotation mark = left pointing guillemet
-            {"not", new Integer(172)}, //not sign
-            {"shy", new Integer(173)}, //soft hyphen = discretionary hyphen
-            {"reg", new Integer(174)}, // ® - registered trademark sign
-            {"macr", new Integer(175)}, //macron = spacing macron = overline = APL overbar
-            {"deg", new Integer(176)}, //degree sign
-            {"plusmn", new Integer(177)}, //plus-minus sign = plus-or-minus sign
-            {"sup2", new Integer(178)}, //superscript two = superscript digit two = squared
-            {"sup3", new Integer(179)}, //superscript three = superscript digit three = cubed
-            {"acute", new Integer(180)}, //acute accent = spacing acute
-            {"micro", new Integer(181)}, //micro sign
-            {"para", new Integer(182)}, //pilcrow sign = paragraph sign
-            {"middot", new Integer(183)}, //middle dot = Georgian comma = Greek middle dot
-            {"cedil", new Integer(184)}, //cedilla = spacing cedilla
-            {"sup1", new Integer(185)}, //superscript one = superscript digit one
-            {"ordm", new Integer(186)}, //masculine ordinal indicator
-            {"raquo", new Integer(187)}, //right-pointing double angle quotation mark = right pointing guillemet
-            {"frac14", new Integer(188)}, //vulgar fraction one quarter = fraction one quarter
-            {"frac12", new Integer(189)}, //vulgar fraction one half = fraction one half
-            {"frac34", new Integer(190)}, //vulgar fraction three quarters = fraction three quarters
-            {"iquest", new Integer(191)}, //inverted question mark = turned question mark
-            {"Agrave", new Integer(192)}, // À - uppercase A, grave accent
-            {"Aacute", new Integer(193)}, // Á - uppercase A, acute accent
-            {"Acirc", new Integer(194)}, // Â - uppercase A, circumflex accent
-            {"Atilde", new Integer(195)}, // Ã - uppercase A, tilde
-            {"Auml", new Integer(196)}, // Ä - uppercase A, umlaut
-            {"Aring", new Integer(197)}, // Å - uppercase A, ring
-            {"AElig", new Integer(198)}, // Æ - uppercase AE
-            {"Ccedil", new Integer(199)}, // Ç - uppercase C, cedilla
-            {"Egrave", new Integer(200)}, // È - uppercase E, grave accent
-            {"Eacute", new Integer(201)}, // É - uppercase E, acute accent
-            {"Ecirc", new Integer(202)}, // Ê - uppercase E, circumflex accent
-            {"Euml", new Integer(203)}, // Ë - uppercase E, umlaut
-            {"Igrave", new Integer(204)}, // Ì - uppercase I, grave accent
-            {"Iacute", new Integer(205)}, // Í - uppercase I, acute accent
-            {"Icirc", new Integer(206)}, // Î - uppercase I, circumflex accent
-            {"Iuml", new Integer(207)}, // Ï - uppercase I, umlaut
-            {"ETH", new Integer(208)}, // Ð - uppercase Eth, Icelandic
-            {"Ntilde", new Integer(209)}, // Ñ - uppercase N, tilde
-            {"Ograve", new Integer(210)}, // Ò - uppercase O, grave accent
-            {"Oacute", new Integer(211)}, // Ó - uppercase O, acute accent
-            {"Ocirc", new Integer(212)}, // Ô - uppercase O, circumflex accent
-            {"Otilde", new Integer(213)}, // Õ - uppercase O, tilde
-            {"Ouml", new Integer(214)}, // Ö - uppercase O, umlaut
-            {"times", new Integer(215)}, //multiplication sign
-            {"Oslash", new Integer(216)}, // Ø - uppercase O, slash
-            {"Ugrave", new Integer(217)}, // Ù - uppercase U, grave accent
-            {"Uacute", new Integer(218)}, // Ú - uppercase U, acute accent
-            {"Ucirc", new Integer(219)}, // Û - uppercase U, circumflex accent
-            {"Uuml", new Integer(220)}, // Ü - uppercase U, umlaut
-            {"Yacute", new Integer(221)}, // Ý - uppercase Y, acute accent
-            {"THORN", new Integer(222)}, // Þ - uppercase THORN, Icelandic
-            {"szlig", new Integer(223)}, // ß - lowercase sharps, German
-            {"agrave", new Integer(224)}, // à - lowercase a, grave accent
-            {"aacute", new Integer(225)}, // á - lowercase a, acute accent
-            {"acirc", new Integer(226)}, // â - lowercase a, circumflex accent
-            {"atilde", new Integer(227)}, // ã - lowercase a, tilde
-            {"auml", new Integer(228)}, // ä - lowercase a, umlaut
-            {"aring", new Integer(229)}, // å - lowercase a, ring
-            {"aelig", new Integer(230)}, // æ - lowercase ae
-            {"ccedil", new Integer(231)}, // ç - lowercase c, cedilla
-            {"egrave", new Integer(232)}, // è - lowercase e, grave accent
-            {"eacute", new Integer(233)}, // é - lowercase e, acute accent
-            {"ecirc", new Integer(234)}, // ê - lowercase e, circumflex accent
-            {"euml", new Integer(235)}, // ë - lowercase e, umlaut
-            {"igrave", new Integer(236)}, // ì - lowercase i, grave accent
-            {"iacute", new Integer(237)}, // í - lowercase i, acute accent
-            {"icirc", new Integer(238)}, // î - lowercase i, circumflex accent
-            {"iuml", new Integer(239)}, // ï - lowercase i, umlaut
-            {"eth", new Integer(240)}, // ð - lowercase eth, Icelandic
-            {"ntilde", new Integer(241)}, // ñ - lowercase n, tilde
-            {"ograve", new Integer(242)}, // ò - lowercase o, grave accent
-            {"oacute", new Integer(243)}, // ó - lowercase o, acute accent
-            {"ocirc", new Integer(244)}, // ô - lowercase o, circumflex accent
-            {"otilde", new Integer(245)}, // õ - lowercase o, tilde
-            {"ouml", new Integer(246)}, // ö - lowercase o, umlaut
-            {"divide", new Integer(247)}, // division sign
-            {"oslash", new Integer(248)}, // ø - lowercase o, slash
-            {"ugrave", new Integer(249)}, // ù - lowercase u, grave accent
-            {"uacute", new Integer(250)}, // ú - lowercase u, acute accent
-            {"ucirc", new Integer(251)}, // û - lowercase u, circumflex accent
-            {"uuml", new Integer(252)}, // ü - lowercase u, umlaut
-            {"yacute", new Integer(253)}, // ý - lowercase y, acute accent
-            {"thorn", new Integer(254)}, // þ - lowercase thorn, Icelandic
-            {"yuml", new Integer(255)}, // ÿ - lowercase y, umlaut
-            {"euro", new Integer(8364)}, // Euro symbol
-        };
-
-        private Map mapNameToValue;
-        private Map mapValueToName;
-
-        public Entities() {
-            mapNameToValue = new HashMap();
-            mapValueToName = new HashMap();
-            for (int i = 0; i < entities.length; ++i) {
-                mapNameToValue.put(entities[i][0], entities[i][1]);
-                mapValueToName.put(entities[i][1], entities[i][0]);
-            }
-        }
-
-        public String entityName(int value) {
-            return (String) mapValueToName.get(new Integer(value));
-        }
-
-        public Integer entityValue(String name) {
-            return (Integer) mapNameToValue.get(name);
-        }
-    }
-
-    private static Entities entities;
-
-    private static void initEntities() {
-        if (entities == null)
-            entities = new Entities();
-    }
-
-    private static String entityName(char ch) {
-        initEntities();
-        return entities.entityName(ch);
-    }
-
-    private static Integer entityValue(String entity) {
-        initEntities();
-        return entities.entityValue(entity);
-    }
-
    /**
     * <p>Turns funky characters into HTML entity equivalents.</p>
+     * <p>
     * For example: <tt>"bread" & "butter"</tt> => <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
-     * Supports all known HTML entities, including funky accents. See the source code for more detail.
-     * see http://hotwired.lycos.com/webmonkey/reference/special_characters/
+     * </p>
+     * <p>Supports all known HTML 4.0 entities, including funky accents.
+     * See the source code for more detail.
+     * </p>
+     * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">Webmonkey special characters reference</a>
+     * @see Entities
     * @see #unescapeHtml(String)
     **/
    public static String escapeHtml(String str) {
+        return escapeEntities(str, Entities.HTML40);
+    }
+
+    /**
+     * <p>Given a string containing entity escapes, returns a string
+     * containing the actual Unicode characters corresponding to the
+     * escapes.</p>
+     * <p>Entity names are resolved against <code>Entities.HTML40</code>.</p>
+     *
+     * @param str the string to unescape
+     * @return the unescaped string
+     * @see #escapeHtml(String)
+     **/
+    public static String unescapeHtml(String str) {
+        return unescapeEntities(str, Entities.HTML40);
+    }
+
+    /**
+     * <p>Turns funky characters into XML entity equivalents.</p>
+     * <p>
+     * For example: <tt>"bread" &amp; "butter"</tt> =&gt; <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
+     * </p>
+     * <p>
+     * Supports only the four basic XML entities (gt, lt, quot, amp).
+     * Does not support DTDs or external entities.
+     * </p>
+     *
+     * @param str the string to escape
+     * @return the escaped string
+     * @see #unescapeXml(java.lang.String)
+     **/
+    public static String escapeXml(String str) {
+        return escapeEntities(str, Entities.XML);
+    }
+
+    /**
+     * <p>Given a string containing XML entity escapes, returns a string
+     * containing the actual Unicode characters corresponding to the
+     * escapes.
+     * </p>
+     * <p>
+     * Supports only the four basic XML entities (gt, lt, quot, amp).
+     * Does not support DTDs or external entities.
+     * </p>
+     *
+     * @param str the string to unescape
+     * @return the unescaped string
+     * @see #escapeXml(String)
+     **/
+    public static String unescapeXml(String str) {
+        return unescapeEntities(str, Entities.XML);
+    }
+
+    private static String escapeEntities(String str, Entities entities) {
        StringBuffer buf = new StringBuffer(str.length() * 2);
        int i;
        for (i = 0; i < str.length(); ++i) {
            char ch = str.charAt(i);
-            String entity = entityName(ch);
+            String entity = entities.entityName(ch);
            if (entity == null) {
-                if (((int) ch) > 128) {   // should this be 127 or 128?
+                if (((int) ch) > 0x7F) {
                    int intValue = ((int) ch);
                    buf.append("&#" + intValue + ";");
                } else {
@ -524,14 +434,7 @@ public class StringEscapeUtils {
        return buf.toString();
    }

-    /**
-     * Given a string containing entity escapes, returns a string
-     * containing the actual Unicode characters corresponding to the
-     * escapes.
-     *
-     * @see #escapeHtml(String)
-     **/
-    public static String unescapeHtml(String str) {
+    private static String unescapeEntities(String str, Entities entities) {
        StringBuffer buf = new StringBuffer(str.length());
        int i;
        for (i = 0; i < str.length(); ++i) {
@ -547,7 +450,7 @@ public class StringEscapeUtils {
                if (entity.charAt(0) == '#') {
                    iso = new Integer(entity.substring(1));
                } else {
-                    iso = entityValue(entity);
+                    iso = entities.entityValue(entity);
                }
                if (iso == null) {
                    buf.append("&" + entity + ";");
--- a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java
+++ b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java
@ -66,7 +66,7 @@ import junit.textui.TestRunner;
 *
 * @author of original StringUtilsTest.testEscape = ?
 * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
- * @version $Id: StringEscapeUtilsTest.java,v 1.3 2003/04/09 18:45:29 alex Exp $
+ * @version $Id: StringEscapeUtilsTest.java,v 1.4 2003/04/09 20:55:33 alex Exp $
 */
 public class StringEscapeUtilsTest extends TestCase {
    private final static String FOO = "foo";
@ -143,8 +143,8 @@ public class StringEscapeUtilsTest extends TestCase {

        assertEquals("unescape(String) failed" +
                (message == null ? "" : (": " + message)) +
-                // we escape this so we can see it in the error message
                ": expected '" + StringUtils.escape(expected) +
+                // we escape this so we can see it in the error message
                "' actual '" + StringUtils.escape(actual) + "'",
                expected, actual);

@ -159,8 +159,16 @@ public class StringEscapeUtilsTest extends TestCase {
    }


-    // HTML
+    // HTML and XML
    //--------------------------------------------------------------
+
+    /**
+     * Checks both lookup directions of <code>Entities.XML</code> and that an
+     * unknown entity name yields <code>null</code>.
+     */
+    public void testEntitiesObject() throws Exception {
+        assertEquals("gt", Entities.XML.entityName('>'));
+        assertEquals(new Integer('>'), Entities.XML.entityValue("gt"));
+        // assertNull states the intent directly instead of comparing against a null literal
+        assertNull(Entities.XML.entityValue("xyzzy"));
+    }
+
    String[][] htmlEscapes = {
        {"no escaping", "plain text", "plain text"},
        {"no escaping", "plain text", "plain text"},
@ -171,6 +179,8 @@ public class StringEscapeUtilsTest extends TestCase {
        {"first character only", "&lt; less than", "< less than"},
        {"apostrophe", "Huntington's chorea", "Huntington's chorea"},
        {"languages", "English,Fran&ccedil;ais,&#26085;&#26412;&#35486; (nihongo)", "English,Fran\u00E7ais,\u65E5\u672C\u8A9E (nihongo)"},
+        {"8-bit ascii doesn't number-escape", "~\u007F", "\u007E\u007F"},
+        {"8-bit ascii does number-escape", "&#128;&#159;", "\u0080\u009F"},
    };

    public void testEscapeHtml() {
@ -180,7 +190,7 @@ public class StringEscapeUtilsTest extends TestCase {
        }
    }

-    public void testHtmlunescape() {
+    public void testUnescapeHtml() {
        for (int i = 0; i < htmlEscapes.length; ++i) {
            assertEquals(htmlEscapes[i][0], htmlEscapes[i][2], StringEscapeUtils.unescapeHtml(htmlEscapes[i][1]));
            // todo: add test for (and implement) Writer-based version
@ -191,6 +201,25 @@ public class StringEscapeUtilsTest extends TestCase {
        assertEquals("funny chars pass through OK", "Fran\u00E7ais", StringEscapeUtils.unescapeHtml("Fran\u00E7ais"));
    }

+    /**
+     * Round-trips U+0392 through its named entity <code>&amp;Beta;</code>
+     * using <code>escapeHtml</code> and <code>unescapeHtml</code>.
+     */
+    public void testEscapeHtmlVersions() throws Exception {
+        String beta = "\u0392";
+        String betaEntity = "&Beta;";
+
+        assertEquals(betaEntity, StringEscapeUtils.escapeHtml(beta));
+        assertEquals(beta, StringEscapeUtils.unescapeHtml(betaEntity));
+
+        //todo: refine API for escaping/unescaping specific HTML versions
+    }
+
+    /**
+     * Checks that the XML variants handle the basic entities by name and
+     * use numeric references for characters that only HTML names.
+     */
+    public void testEscapeXml() throws Exception {
+        String numbersNotNames = "XML should use numbers, not names for HTML entities";
+
+        assertEquals("&lt;abc&gt;", StringEscapeUtils.escapeXml("<abc>"));
+        assertEquals("<abc>", StringEscapeUtils.unescapeXml("&lt;abc&gt;"));
+
+        assertEquals(numbersNotNames, "&#161;", StringEscapeUtils.escapeXml("\u00A1"));
+        assertEquals(numbersNotNames, "\u00A0", StringEscapeUtils.unescapeXml("&#160;"));
+    }
+
    // SQL
    // see http://www.jguru.com/faq/view.jsp?EID=8881
    //--------------------