From ab7d8397efe55b78331e00c0872303f76a2cf68c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 6 Sep 2012 18:37:38 +0000 Subject: [PATCH] fix broken unicode in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1381711 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/scripts/checkJavadocLinks.py | 9 +++++++++ .../org/apache/lucene/analysis/synonym/SynonymMap.java | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/dev-tools/scripts/checkJavadocLinks.py b/dev-tools/scripts/checkJavadocLinks.py index 736a3c8781a..1372e8f3a5a 100644 --- a/dev-tools/scripts/checkJavadocLinks.py +++ b/dev-tools/scripts/checkJavadocLinks.py @@ -23,6 +23,9 @@ import urllib.parse as urlparse reHyperlink = re.compile(r'', re.I) reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I) +# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ +reValidChar = re.compile("^[\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]*$") + # silly emacs: ' class FindHyperlinks(HTMLParser): @@ -79,6 +82,12 @@ class FindHyperlinks(HTMLParser): def parse(baseURL, html): global failures + # look for broken unicode + if not reValidChar.match(html): + print(' WARNING: invalid characters detected in: %s' % baseURL) + failures = True + return [], [] + parser = FindHyperlinks(baseURL) try: parser.feed(html) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java index 5de0c6cff0f..9a7555946f4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java @@ -227,8 +227,8 @@ public class SynonymMap { /** * Add a phrase->phrase synonym mapping. * Phrases are character sequences where words are - * separated with character zero (\u0000). Empty words - * (two \u0000s in a row) are not allowed in the input nor + * separated with character zero (U+0000). Empty words + * (two U+0000s in a row) are not allowed in the input nor * the output! * * @param input input phrase