# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import re
import sys

# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
#
# NOTE(review): the copy of this script under review had its line breaks
# collapsed and every "<...>" span removed. Line structure and indentation
# (of this script and of the Java/JFlex text it prints) were re-derived, and
# the following pieces are reconstructions that must be confirmed against
# version control before regenerating anything:
#   * the entity regex and the loop that fills `codes`,
#   * the opening 'CharacterEntities = ( ' text of the alternation,
#   * the generic type arguments in the printed Java declarations,
#   * the entity declarations, which are absent from get_entity_text().

# A generated line is flushed before an entry would bring it to this width.
MAX_LINE_WIDTH = 80

# Entities whose all-upper-case spelling is accepted as well.
UPPER_CASE_VARIANTS = ('quot', 'copy', 'gt', 'lt', 'reg', 'amp')

# Continuation indents for the two wrapped listings.
# NOTE(review): the original widths were collapsed to one space; these values
# only affect the layout of the generated text.
ALTERNATION_INDENT = ' ' * 19
ENTITY_ARRAY_INDENT = ' ' * 6

# NOTE(review): reconstructed. Only the leading r'\s*' of this pattern
# survived. It is meant to match DTD lines such as
#   <!ENTITY nbsp "&#160;">   and   <!ENTITY lt "&#38;#60;">
# capturing the entity name and its decimal code point.
ENTITY_DECLARATION = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')


def main():
    """Write the generated JFlex fragment to the file named by sys.argv[1].

    The output is the Apache license comment, the CharacterEntities
    alternation, and a %{ ... %} Java block that builds the entity map.
    Exits with a usage message when no output path is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s OUTPUT_FILE' % (sys.argv[0] if sys.argv else 'script'))

    with open(sys.argv[1], 'w', encoding='utf-8') as f:
        # redirect_stdout puts the previous sys.stdout back on exit; the old
        # `sys.stdout = f` left stdout bound to a closed file afterwards.
        with contextlib.redirect_stdout(f):
            print(get_apache_license())
            codes = _parse_entity_codes(get_entity_text())
            keys = sorted(codes)
            _print_entity_alternation(keys)
            _print_entity_map(keys, codes)


def _parse_entity_codes(text):
    """Map each entity name declared in *text* to the body of a Java string literal."""
    # NOTE(review): this whole loop is reconstructed (see the module note).
    codes = {}
    for line in text.split('\n'):
        match = ENTITY_DECLARATION.match(line)
        if match is None:
            continue
        key = match.group(1)
        if key == 'quot':
            # Java translates \u0022 before lexing, which would end the string
            # literal early, so an escaped quote is emitted instead.
            codes[key] = r'\"'
        elif key == 'nbsp':
            # NOTE(review): presumably the filter wants a plain space for
            # &nbsp; rather than U+00A0 - confirm.
            codes[key] = ' '
        else:
            codes[key] = r'\u%04X' % int(match.group(2))
    return codes


def _append_wrapped(output_line, new_entry, indent):
    """Return output_line + new_entry, first printing output_line if it is full."""
    if len(output_line) + len(new_entry) >= MAX_LINE_WIDTH:
        print(output_line)
        output_line = indent
    return output_line + new_entry


def _print_entity_alternation(keys):
    """Print the regex alternation of all entity names, wrapped to width."""
    # NOTE(review): the opening text below is reconstructed.
    output_line = 'CharacterEntities = ( '
    first_entry = True
    for key in keys:
        new_entry = ('"%s"' if first_entry else ' | "%s"') % key
        first_entry = False
        output_line = _append_wrapped(output_line, new_entry, ALTERNATION_INDENT)
        if key in UPPER_CASE_VARIANTS:
            output_line = _append_wrapped(
                output_line, ' | "%s"' % key.upper(), ALTERNATION_INDENT)
    print(output_line, ')')


def _print_entity_map(keys, codes):
    """Print the %{ ... %} Java block that fills the entity lookup tables."""
    print('%{')
    # NOTE(review): <String,String> and <Character> are restored. The printed
    # Java assigns upperCaseVariantsAccepted.get(...) to a String, which a raw
    # Map would not allow.
    print('  private static final Map<String,String> upperCaseVariantsAccepted')
    print('      = new HashMap<>();')
    print('  static {')
    for key in UPPER_CASE_VARIANTS:
        print('    upperCaseVariantsAccepted.put("%s", "%s");' % (key, key.upper()))
    print('  }')
    print('  private static final CharArrayMap<Character> entityValues')
    print('      = new CharArrayMap<>(%i, false);' % len(keys))
    print('  static {')
    print('    String[] entities = {')
    output_line = ENTITY_ARRAY_INDENT
    for key in keys:
        new_entry = ' "%s", "%s",' % (key, codes[key])
        output_line = _append_wrapped(output_line, new_entry, ENTITY_ARRAY_INDENT)
    # Drop the comma that follows the final array element.
    print(output_line[:-1])
    print('    };')
    print('    for (int i = 0 ; i < entities.length ; i += 2) {')
    print('      Character value = entities[i + 1].charAt(0);')
    print('      entityValues.put(entities[i], value);')
    print('      String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
    print('      if (upperCaseVariant != null) {')
    print('        entityValues.put(upperCaseVariant, value);')
    print('      }')
    print('    }')
    print('  }')
    print('%}')


def get_entity_text():
    """Return the XHTML character-entity reference text that main() scans."""
    # The text below is taken from section F.1 of the W3C XHTML
    # Modularization recommendation (the exact source URL is missing from
    # this comment).
    # NOTE(review): only the headings and download notes are present; there
    # are no "<!ENTITY ...>" lines, so main() currently finds no entities.
    # Presumably the DTD bodies were lost - restore them from the .ent files
    # named below.
    text = r"""
F.1. XHTML Character Entities

XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.

F.1.1. XHTML Latin 1 Character Entities

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.

F.1.2. XHTML Special Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.

F.1.3. XHTML Mathematical, Greek, and Symbolic Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
"""
    return text


def get_apache_license():
    """Return the Apache 2.0 license notice as a Java block comment."""
    license = r"""/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
"""
    return license


# Only generate when run as a script, so importing the module has no side effects.
if __name__ == '__main__':
    main()