fix broken unicode in javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1381711 13f79535-47bb-0310-9956-ffa450edef68
2012-09-06 18:37:38 +00:00 · 2012-09-06 18:37:38 +00:00 · ab7d8397ef
parent bcef091d82
commit ab7d8397ef
2 changed files with 11 additions and 2 deletions
--- a/dev-tools/scripts/checkJavadocLinks.py
+++ b/dev-tools/scripts/checkJavadocLinks.py
@@ -23,6 +23,9 @@ import urllib.parse as urlparse
 reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
 reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)

+# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+reValidChar = re.compile("^[\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]*$")
+
 # silly emacs: '

 class FindHyperlinks(HTMLParser):
@@ -79,6 +82,12 @@ class FindHyperlinks(HTMLParser):
                   
 def parse(baseURL, html):
  global failures
+  # look for broken unicode
+  if not reValidChar.match(html):
+    print(' WARNING: invalid characters detected in: %s' % baseURL)
+    failures = True
+    return [], []
+
  parser = FindHyperlinks(baseURL)
  try:
    parser.feed(html)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
@@ -227,8 +227,8 @@ public class SynonymMap {
    /**
     * Add a phrase->phrase synonym mapping.
     * Phrases are character sequences where words are
-     * separated with character zero (\u0000).  Empty words
-     * (two \u0000s in a row) are not allowed in the input nor
+     * separated with character zero (U+0000).  Empty words
+     * (two U+0000s in a row) are not allowed in the input nor
     * the output!
     * 
     * @param input input phrase