lucene/gradle/generation/jflex/htmlentity.py

544 lines
31 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import sys
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def main():
with open(sys.argv[1], 'w') as f:
sys.stdout = f
print(get_apache_license())
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' ';
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
print(output_line, ')')
print('%{')
print(' private static final Map<String,String> upperCaseVariantsAccepted')
print(' = new HashMap<>();')
print(' static {')
print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
print(' upperCaseVariantsAccepted.put("copy", "COPY");')
print(' upperCaseVariantsAccepted.put("gt", "GT");')
print(' upperCaseVariantsAccepted.put("lt", "LT");')
print(' upperCaseVariantsAccepted.put("reg", "REG");')
print(' upperCaseVariantsAccepted.put("amp", "AMP");')
print(' }')
print(' private static final CharArrayMap<Character> entityValues')
print(' = new CharArrayMap<>(%i, false);' % len(keys))
print(' static {')
print(' String[] entities = {')
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
print(output_line[:-1])
print(' };')
print(' for (int i = 0 ; i < entities.length ; i += 2) {')
print(' Character value = entities[i + 1].charAt(0);')
print(' entityValues.put(entities[i], value);')
print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
print(' if (upperCaseVariant != null) {')
print(' entityValues.put(upperCaseVariant, value);')
print(' }')
print(' }')
print(" }")
print("%}")
def get_entity_text():
# The text below is taken verbatim from
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
text = r"""
F.1. XHTML Character Entities
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent
Typical invocation:
<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
Revision: Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->
F.1.2. XHTML Special Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent
Typical invocation:
<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
Revision: Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
<!-- end of xhtml-special.ent -->
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent
Typical invocation:
<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
Revision: Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->
<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->
<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->
<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
<!-- end of xhtml-symbol.ent -->
"""
return text
def get_apache_license():
license = r"""/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
return license
main()