mirror of https://github.com/apache/lucene.git
fix broken unicode in javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1381711 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bcef091d82
commit
ab7d8397ef
|
@ -23,6 +23,9 @@ import urllib.parse as urlparse
|
||||||
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
|
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
|
||||||
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
|
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
|
||||||
|
|
||||||
|
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
|
||||||
|
reValidChar = re.compile("^[\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]*$")
|
||||||
|
|
||||||
# silly emacs: '
|
# silly emacs: '
|
||||||
|
|
||||||
class FindHyperlinks(HTMLParser):
|
class FindHyperlinks(HTMLParser):
|
||||||
|
@ -79,6 +82,12 @@ class FindHyperlinks(HTMLParser):
|
||||||
|
|
||||||
def parse(baseURL, html):
|
def parse(baseURL, html):
|
||||||
global failures
|
global failures
|
||||||
|
# look for broken unicode
|
||||||
|
if not reValidChar.match(html):
|
||||||
|
print(' WARNING: invalid characters detected in: %s' % baseURL)
|
||||||
|
failures = True
|
||||||
|
return [], []
|
||||||
|
|
||||||
parser = FindHyperlinks(baseURL)
|
parser = FindHyperlinks(baseURL)
|
||||||
try:
|
try:
|
||||||
parser.feed(html)
|
parser.feed(html)
|
||||||
|
|
|
@ -227,8 +227,8 @@ public class SynonymMap {
|
||||||
/**
|
/**
|
||||||
* Add a phrase->phrase synonym mapping.
|
* Add a phrase->phrase synonym mapping.
|
||||||
* Phrases are character sequences where words are
|
* Phrases are character sequences where words are
|
||||||
* separated with character zero (\u0000). Empty words
|
* separated with character zero (U+0000). Empty words
|
||||||
* (two \u0000s in a row) are not allowed in the input nor
|
* (two U+0000s in a row) are not allowed in the input nor
|
||||||
* the output!
|
* the output!
|
||||||
*
|
*
|
||||||
* @param input input phrase
|
* @param input input phrase
|
||||||
|
|
Loading…
Reference in New Issue