mirror of https://github.com/apache/lucene.git
544 lines
31 KiB
Python
544 lines
31 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import re
|
|
import sys
|
|
|
|
# A simple python script to generate an HTML entity map and a regex alternation
|
|
# for inclusion in HTMLStripCharFilter.jflex.
|
|
|
|
def main():
    """Write the HTML entity definitions for HTMLStripCharFilter.jflex.

    The output file is named by ``sys.argv[1]`` and receives, in order:

    1. the Apache license header from ``get_apache_license()``;
    2. a ``CharacterEntities = ( ... )`` alternation listing every entity
       name parsed from ``get_entity_text()``, plus the upper-case spelling
       of the names in ``upper_case_variants``;
    3. a ``%{ ... %}`` block of Java code declaring the
       ``upperCaseVariantsAccepted`` and ``entityValues`` maps.

    Raises:
        IndexError: if no output path was given on the command line.
    """
    # Entity names whose all-upper-case spelling is accepted as well.
    upper_case_variants = ('quot', 'copy', 'gt', 'lt', 'reg', 'amp')

    # Collect entity name -> Java string-literal escape for its character.
    codes = {}
    entity_pattern = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
    for line in get_entity_text().split('\n'):
        match = entity_pattern.match(line)
        if match:
            key = match.group(1)
            if key == 'quot':
                # A bare double quote would terminate the Java string literal.
                codes[key] = r'\"'
            elif key == 'nbsp':
                codes[key] = ' '
            else:
                codes[key] = r'\u%04X' % int(match.group(2))

    keys = sorted(codes)

    # Write through an explicit file handle instead of rebinding sys.stdout:
    # rebinding would leave sys.stdout pointing at a closed file once the
    # ``with`` block exits.
    with open(sys.argv[1], 'w') as out:

        def emit(*args):
            print(*args, file=out)

        def append_entry(current, entry):
            # Flush the pending line first if adding ``entry`` would reach
            # 80 columns, then continue on a fresh line.
            if len(current) + len(entry) >= 80:
                emit(current)
                current = ' '
            return current + entry

        emit(get_apache_license())

        # Regex alternation of all entity names.
        output_line = 'CharacterEntities = ( '
        for index, key in enumerate(keys):
            template = '"%s"' if index == 0 else ' | "%s"'
            output_line = append_entry(output_line, template % key)
            if key in upper_case_variants:
                output_line = append_entry(output_line,
                                           ' | "%s"' % key.upper())
        emit(output_line, ')')

        # Java code block embedded in the JFlex specification.
        emit('%{')
        emit(' private static final Map<String,String> upperCaseVariantsAccepted')
        emit(' = new HashMap<>();')
        emit(' static {')
        for name in upper_case_variants:
            emit(' upperCaseVariantsAccepted.put("%s", "%s");'
                 % (name, name.upper()))
        emit(' }')
        emit(' private static final CharArrayMap<Character> entityValues')
        emit(' = new CharArrayMap<>(%i, false);' % len(keys))
        emit(' static {')
        emit(' String[] entities = {')
        output_line = ' '
        for key in keys:
            output_line = append_entry(output_line,
                                       ' "%s", "%s",' % (key, codes[key]))
        # Drop the trailing comma after the last array element.
        emit(output_line[:-1])
        emit(' };')
        emit(' for (int i = 0 ; i < entities.length ; i += 2) {')
        emit(' Character value = entities[i + 1].charAt(0);')
        emit(' entityValues.put(entities[i], value);')
        emit(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
        emit(' if (upperCaseVariant != null) {')
        emit(' entityValues.put(upperCaseVariant, value);')
        emit(' }')
        emit(' }')
        emit(" }")
        emit("%}")
|
|
|
|
def get_entity_text():
|
|
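# The text below is taken from the "XHTML Character Entities" listing
# (section F.1) of the W3C XHTML Modularization recommendation; the download
# URLs of the individual .ent files are quoted in the text itself.
# (Previously attributed to <http://www.w3.org/TR/REC-html40/sgml/entities.html>.)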
|
|
text = r"""
|
|
F.1. XHTML Character Entities
|
|
|
|
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
|
|
F.1.1. XHTML Latin 1 Character Entities
|
|
|
|
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
|
|
|
|
<!-- ...................................................................... -->
|
|
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
|
|
<!-- file: xhtml-lat1.ent
|
|
|
|
Typical invocation:
|
|
|
|
<!ENTITY % xhtml-lat1
|
|
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
|
|
"xhtml-lat1.ent" >
|
|
%xhtml-lat1;
|
|
|
|
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
|
|
|
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
|
|
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
|
|
|
|
Revision: Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
|
|
|
Portions (C) International Organization for Standardization 1986:
|
|
Permission to copy in any form is granted for use with conforming
|
|
SGML systems and applications as defined in ISO 8879, provided
|
|
this notice is included in all copies.
|
|
-->
|
|
|
|
<!ENTITY nbsp " " ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
|
|
<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
|
|
<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum -->
|
|
<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum -->
|
|
<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum -->
|
|
<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
|
|
<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
|
|
<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum -->
|
|
<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
|
|
<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum -->
|
|
<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
|
|
<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
|
|
<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum -->
|
|
<!ENTITY shy "­" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
|
|
<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
|
|
<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
|
|
<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum -->
|
|
<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
|
|
<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
|
|
<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
|
|
<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
|
|
<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum -->
|
|
<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
|
|
<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
|
|
<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
|
|
<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
|
|
<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
|
|
<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
|
|
<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
|
|
<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
|
|
<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
|
|
<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
|
|
<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
|
|
<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
|
|
<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
|
|
<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
|
|
<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
|
|
<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
|
|
<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
|
|
<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
|
|
<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
|
|
<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
|
|
<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
|
|
<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
|
|
<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
|
|
<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
|
|
<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
|
|
<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
|
|
<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
|
|
<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
|
|
<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
|
|
<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
|
|
<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
|
|
<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
|
|
<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
|
|
<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum -->
|
|
<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
|
|
<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
|
|
<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
|
|
<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
|
|
<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
|
|
<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
|
|
<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 -->
|
|
<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
|
|
<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
|
|
<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
|
|
<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
|
|
<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
|
|
<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
|
|
<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
|
|
<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
|
|
<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
|
|
<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
|
|
<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
|
|
<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
|
|
<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
|
|
<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 -->
|
|
<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 -->
|
|
<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
|
|
<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
|
|
<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 -->
|
|
<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
|
|
<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
|
|
<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
|
|
<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
|
|
<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
|
|
<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
|
|
<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum -->
|
|
<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
|
|
<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
|
|
<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 -->
|
|
<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
|
|
<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
|
|
<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 -->
|
|
<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 -->
|
|
<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
|
|
<!-- end of xhtml-lat1.ent -->
|
|
|
|
F.1.2. XHTML Special Characters
|
|
|
|
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
|
|
|
|
<!-- ...................................................................... -->
|
|
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
|
|
<!-- file: xhtml-special.ent
|
|
|
|
Typical invocation:
|
|
|
|
<!ENTITY % xhtml-special
|
|
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
|
|
"xhtml-special.ent" >
|
|
%xhtml-special;
|
|
|
|
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
|
|
|
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
|
|
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
|
|
|
|
Revision: Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
|
|
|
Portions (C) International Organization for Standardization 1986:
|
|
Permission to copy in any form is granted for use with conforming
|
|
SGML systems and applications as defined in ISO 8879, provided
|
|
this notice is included in all copies.
|
|
|
|
Revisions:
|
|
2000-10-28: added ' and altered XML Predefined Entities for compatibility
|
|
-->
|
|
|
|
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
|
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
|
|
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
|
|
numbers are given for each character, in hex. Entity values are
|
|
decimal conversions of the ISO 10646 values and refer to the
|
|
document character set. Names are Unicode [UNICODE] names.
|
|
-->
|
|
|
|
<!-- C0 Controls and Basic Latin -->
|
|
<!ENTITY lt "&#60;" ><!-- less-than sign, U+003C ISOnum -->
|
|
<!ENTITY gt ">" ><!-- greater-than sign, U+003E ISOnum -->
|
|
<!ENTITY amp "&#38;" ><!-- ampersand, U+0026 ISOnum -->
|
|
<!ENTITY apos "'" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
|
|
<!ENTITY quot """ ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
|
|
|
|
<!-- Latin Extended-A -->
|
|
<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
|
|
<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
|
|
|
|
<!-- ligature is a misnomer, this is a separate character in some languages -->
|
|
<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
|
|
<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
|
|
<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
|
|
|
|
<!-- Spacing Modifier Letters -->
|
|
<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
|
|
<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia -->
|
|
|
|
<!-- General Punctuation -->
|
|
<!ENTITY ensp " " ><!-- en space, U+2002 ISOpub -->
|
|
<!ENTITY emsp " " ><!-- em space, U+2003 ISOpub -->
|
|
<!ENTITY thinsp " " ><!-- thin space, U+2009 ISOpub -->
|
|
<!ENTITY zwnj "‌" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
|
|
<!ENTITY zwj "‍" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
|
|
<!ENTITY lrm "‎" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
|
|
<!ENTITY rlm "‏" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
|
|
<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub -->
|
|
<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub -->
|
|
<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum -->
|
|
<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum -->
|
|
<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW -->
|
|
<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum -->
|
|
<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum -->
|
|
<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW -->
|
|
<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub -->
|
|
<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub -->
|
|
<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech -->
|
|
|
|
<!-- lsaquo is proposed but not yet ISO standardized -->
|
|
<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
|
|
<!-- rsaquo is proposed but not yet ISO standardized -->
|
|
<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
|
|
<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW -->
|
|
|
|
<!-- end of xhtml-special.ent -->
|
|
|
|
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
|
|
|
|
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
|
|
|
|
<!-- ...................................................................... -->
|
|
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
|
|
<!-- file: xhtml-symbol.ent
|
|
|
|
Typical invocation:
|
|
|
|
<!ENTITY % xhtml-symbol
|
|
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
|
|
"xhtml-symbol.ent" >
|
|
%xhtml-symbol;
|
|
|
|
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
|
|
|
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
|
|
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
|
|
|
|
Revision: Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
|
|
|
Portions (C) International Organization for Standardization 1986:
|
|
Permission to copy in any form is granted for use with conforming
|
|
SGML systems and applications as defined in ISO 8879, provided
|
|
this notice is included in all copies.
|
|
-->
|
|
|
|
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
|
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
|
|
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
|
|
numbers are given for each character, in hex. Entity values are
|
|
decimal conversions of the ISO 10646 values and refer to the
|
|
document character set. Names are Unicode [UNICODE] names.
|
|
-->
|
|
|
|
<!-- Latin Extended-B -->
|
|
<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function
|
|
= florin, U+0192 ISOtech -->
|
|
|
|
<!-- Greek -->
|
|
<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 -->
|
|
<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 -->
|
|
<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
|
|
<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
|
|
<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 -->
|
|
<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 -->
|
|
<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 -->
|
|
<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
|
|
<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 -->
|
|
<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A -->
|
|
<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
|
|
<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C -->
|
|
<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D -->
|
|
<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
|
|
<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F -->
|
|
<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
|
|
<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 -->
|
|
<!-- there is no Sigmaf, and no U+03A2 character either -->
|
|
<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
|
|
<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 -->
|
|
<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon,
|
|
U+03A5 ISOgrk3 -->
|
|
<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
|
|
<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 -->
|
|
<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
|
|
<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
|
|
<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
|
|
<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
|
|
<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
|
|
<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
|
|
<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
|
|
<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
|
|
<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
|
|
<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
|
|
<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
|
|
<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
|
|
<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
|
|
<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
|
|
<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
|
|
<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
|
|
<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW -->
|
|
<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
|
|
<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
|
|
<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
|
|
<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
|
|
<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
|
|
<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
|
|
<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
|
|
<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
|
|
<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
|
|
<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
|
|
<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW -->
|
|
<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
|
|
<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
|
|
|
|
<!-- General Punctuation -->
|
|
<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub -->
|
|
<!-- bullet is NOT the same as bullet operator, U+2219 -->
|
|
<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
|
|
<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech -->
|
|
<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
|
|
<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW -->
|
|
<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW -->
|
|
|
|
<!-- Letterlike Symbols -->
|
|
<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
|
|
<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
|
|
<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
|
|
<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum -->
|
|
<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
|
|
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
|
|
the same glyph could be used to depict both characters -->
|
|
|
|
<!-- Arrows -->
|
|
<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum -->
|
|
<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum-->
|
|
<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum -->
|
|
<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum -->
|
|
<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa -->
|
|
<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards
|
|
= carriage return, U+21B5 NEW -->
|
|
<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech -->
|
|
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
|
|
but also does not have any other character for that function. So ? lArr can
|
|
be used for 'is implied by' as ISOtech suggests -->
|
|
<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa -->
|
|
<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech -->
|
|
<!-- Unicode does not say this is the 'implies' character but does not have
|
|
another character with this function so ?
|
|
rArr can be used for 'implies' as ISOtech suggests -->
|
|
<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa -->
|
|
<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa -->
|
|
|
|
<!-- Mathematical Operators -->
|
|
<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech -->
|
|
<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech -->
|
|
<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech -->
|
|
<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso -->
|
|
<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech -->
|
|
<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech -->
|
|
<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech -->
|
|
<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech -->
|
|
<!-- should there be a more memorable name than 'ni'? -->
|
|
<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb -->
|
|
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
|
|
the same glyph might be used for both -->
|
|
<!ENTITY sum "∑" ><!-- n-ary sumation, U+2211 ISOamsb -->
|
|
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
|
|
though the same glyph might be used for both -->
|
|
<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech -->
|
|
<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech -->
|
|
<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech -->
|
|
<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech -->
|
|
<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech -->
|
|
<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso -->
|
|
<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech -->
|
|
<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech -->
|
|
<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech -->
|
|
<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech -->
|
|
<!ENTITY int "∫" ><!-- integral, U+222B ISOtech -->
|
|
<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech -->
|
|
<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
|
|
<!-- tilde operator is NOT the same character as the tilde, U+007E,
|
|
although the same glyph might be used to represent both -->
|
|
<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech -->
|
|
<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
|
|
<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech -->
|
|
<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech -->
|
|
<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech -->
|
|
<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech -->
|
|
<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech -->
|
|
<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech -->
|
|
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
|
|
font encoding and is not included. Should it be, for symmetry?
|
|
It is in ISOamsn -->
|
|
<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn -->
|
|
<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech -->
|
|
<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech -->
|
|
<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
|
|
<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb -->
|
|
<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
|
|
<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb -->
|
|
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
|
|
|
|
<!-- Miscellaneous Technical -->
|
|
<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
|
|
<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc -->
|
|
<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc -->
|
|
<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc -->
|
|
<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
|
|
<!-- lang is NOT the same character as U+003C 'less than'
|
|
or U+2039 'single left-pointing angle quotation mark' -->
|
|
<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
|
|
<!-- rang is NOT the same character as U+003E 'greater than'
|
|
or U+203A 'single right-pointing angle quotation mark' -->
|
|
|
|
<!-- Geometric Shapes -->
|
|
<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub -->
|
|
|
|
<!-- Miscellaneous Symbols -->
|
|
<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub -->
|
|
<!-- black here seems to mean filled as opposed to hollow -->
|
|
<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub -->
|
|
<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub -->
|
|
<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub -->
|
|
|
|
<!-- end of xhtml-symbol.ent -->
|
|
"""
|
|
return text
|
|
|
|
def get_apache_license():
    """Return the Apache license notice as a block comment.

    The returned text starts with ``/*``, ends with ``*/`` and is followed by
    one empty line.
    """
    notice_lines = (
        '/*',
        '* Licensed to the Apache Software Foundation (ASF) under one or more',
        '* contributor license agreements. See the NOTICE file distributed with',
        '* this work for additional information regarding copyright ownership.',
        '* The ASF licenses this file to You under the Apache License, Version 2.0',
        '* (the "License"); you may not use this file except in compliance with',
        '* the License. You may obtain a copy of the License at',
        '*',
        '* http://www.apache.org/licenses/LICENSE-2.0',
        '*',
        '* Unless required by applicable law or agreed to in writing, software',
        '* distributed under the License is distributed on an "AS IS" BASIS,',
        '* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.',
        '* See the License for the specific language governing permissions and',
        '* limitations under the License.',
        '*/',
        '',
        '',
    )
    return '\n'.join(notice_lines)
|
|
|
|
# Run the generator only when this file is executed as a script, so that
# merely importing the module does not read sys.argv or write a file.
if __name__ == '__main__':
    main()
|