# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import sys
import traceback
import urllib.parse as urlparse

try:
    from html.parser import HTMLParser, HTMLParseError
except ImportError:
    # HTMLParseError was removed from html.parser in Python 3.5.  Keep the
    # name defined so existing references still resolve; the standard parser
    # never raises this placeholder.
    from html.parser import HTMLParser

    class HTMLParseError(Exception):
        """Placeholder for the exception class removed from html.parser in Python 3.5."""
# NOTE(review): this pattern is empty, so it matches at every position of any
# input.  It looks like the original expression was lost -- confirm before
# relying on it.  Nothing else in this file uses it.
reHyperlink = re.compile(r'', re.I)

# One or more whitespace-separated ``name=value`` attributes.  group(1) is the
# attribute name and group(2) the value (quotes included) of the LAST attribute
# matched.  Values may be double-quoted, single-quoted or unquoted.
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']*'|[^'"\s]+))+""", re.I)

# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
# The expression below matches only text that is free of C0 control characters
# (other than tab, LF and CR) and of U+FFFE / U+FFFF.
reValidChar = re.compile("^[^\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE\uFFFF]*$")
# silly emacs: '
class FindHyperlinks(HTMLParser):
    """HTML parser that records the anchors and hyperlinks of a single page.

    While a page is fed through the parser, every ``<a name=...>`` is added to
    ``anchors``, every ``<a href=...>`` is resolved against ``baseURL`` and
    appended to ``links``, and opening/closing tags are checked for nesting.

    The handler methods raise RuntimeError when an anchor name is duplicated,
    when an ``<a>`` tag has neither ``name`` nor ``href``, or when a closing
    tag does not match the innermost open tag.
    """

    # Tags that are never pushed on (or popped from) the open-tag stack.
    # NOTE: I don't think 'a' should be in here. But try debugging
    # NumericRangeQuery.html. (Could be javadocs bug, it's a generic type...)
    _UNTRACKED_TAGS = frozenset(
        ('link', 'meta', 'frame', 'br', 'hr', 'p', 'li', 'img', 'col', 'a'))

    # Anchor names tolerated more than once in .../serialized-form.html.
    # Seems like a bug in Javadoc generation... you can't have
    # same anchor name more than once...
    _REPEATABLE_ANCHORS = frozenset(
        ('serializedForm',
         'serialized_methods',
         'readObject(java.io.ObjectInputStream)',
         'writeObject(java.io.ObjectOutputStream)'))

    def __init__(self, baseURL):
        """Create a parser for the page located at ``baseURL``."""
        HTMLParser.__init__(self)
        self.stack = []        # currently open (tracked) tags, innermost last
        self.anchors = set()   # values of <a name=...> seen so far
        self.links = []        # absolute URLs built from <a href=...>
        self.baseURL = baseURL
        self.printed = False   # whether printFile() already emitted the header

    def handle_starttag(self, tag, attrs):
        """Track the open tag and, for ``<a>``, record its anchor or link."""
        if tag not in self._UNTRACKED_TAGS:
            self.stack.append(tag)
        if tag != 'a':
            return

        name = None
        href = None
        for attName, attValue in attrs:
            if attName == 'name':
                name = attValue
            elif attName == 'href':
                href = attValue

        if name is not None:
            assert href is None, \
                'anchor "%s" in %s also carries an href' % (name, self.baseURL)
            if name not in self.anchors:
                self.anchors.add(name)
            elif not (name in self._REPEATABLE_ANCHORS
                      and self.baseURL.endswith('/serialized-form.html')):
                self.printFile()
                raise RuntimeError('anchor "%s" appears more than once' % name)
        elif href is not None:
            self.links.append(urlparse.urljoin(self.baseURL, href.strip()))
        elif not self.baseURL.endswith('/AttributeSource.html'):
            # LUCENE-4010: AttributeSource's javadocs has an unescaped generics!!
            # Seems to be a javadocs bug... (fixed in Java 7), hence the exemption.
            raise RuntimeError('couldn\'t find an href nor name in link in %s: only got these attrs: %s' % (self.baseURL, attrs))

    def handle_endtag(self, tag):
        """Pop ``tag`` from the open-tag stack, raising RuntimeError on a mismatch."""
        if tag in self._UNTRACKED_TAGS:
            return
        line, column = self.getpos()
        if not self.stack:
            # Nothing is open, so there is no "expected" tag to report.
            raise RuntimeError('%s %s:%s: saw </%s> with no opening <%s>' % (self.baseURL, line, column, tag, tag))
        expected = self.stack[-1]
        if expected != tag:
            raise RuntimeError('%s %s:%s: saw </%s> but expected </%s>' % (self.baseURL, line, column, tag, expected))
        self.stack.pop()

    def printFile(self):
        """Print the page URL once, as a header for the messages that follow."""
        if not self.printed:
            print()
            print(' ' + self.baseURL)
            self.printed = True
def parse(baseURL, html):
    """Extract the hyperlinks and anchors of one HTML page.

    baseURL -- URL of the page; relative hrefs are resolved against it.
    html    -- the page content as a string.

    Returns ``(links, anchors)`` as collected by FindHyperlinks.  When the
    page contains invalid characters or cannot be processed, a warning is
    printed, the module-level ``failures`` flag is set and ``([], [])`` is
    returned.
    """
    global failures

    # look for broken unicode
    if not reValidChar.match(html):
        print(' WARNING: invalid characters detected in: %s' % baseURL)
        failures = True
        return [], []

    parser = FindHyperlinks(baseURL)
    try:
        parser.feed(html)
        parser.close()
    except Exception:
        # html.parser no longer raises a dedicated parse error; the problems
        # that do surface here are the RuntimeError/AssertionError raised by
        # FindHyperlinks for malformed pages.  Report them and keep crawling,
        # the run is still marked as failed.
        parser.printFile()
        print(' WARNING: failed to parse %s:' % baseURL)
        traceback.print_exc(file=sys.stdout)
        failures = True
        return [], []

    return parser.links, parser.anchors
# Module-wide result flag: parse() sets it to True when a page cannot be
# processed, and checkAll() ORs in whether any broken link was reported.
failures = False
# Substrings of external (http/https) links that are always accepted.
_ALLOWED_EXTERNAL_FRAGMENTS = (
    'lucene.apache.org/java/docs/mailinglists.html',
    'lucene.apache.org/java/docs/discussion.html',
    'lucene.apache.org/core/discussion.html',
    'lucene.apache.org/solr/mirrors-solr-latest-redir.html',
    'lucene.apache.org/solr/quickstart.html',
)

# External links that are accepted only on an exact match.
_ALLOWED_EXTERNAL_LINKS = (
    'http://lucene.apache.org/core/',
    'http://lucene.apache.org/solr/',
    'http://lucene.apache.org/solr/resources.html',
)


def _findLinkProblem(fullPath, origLink, allFiles):
    """Return the report line for one hyperlink, or None when it is acceptable.

    fullPath -- URL of the page that contains the link.
    origLink -- the absolute link as collected by parse().
    allFiles -- mapping of page URL -> (links, anchors) for every parsed page.
    """
    # Split off the fragment; an empty fragment ("page.html#") yields ''.
    link, hashSep, fragment = origLink.partition('#')
    anchor = fragment if hashSep else None

    # remove any whitespace from the middle of the link
    link = ''.join(link.split())
    # drop the query string
    link = link.partition('?')[0]

    # TODO: normalize path sep for windows...
    if link.startswith(('http://', 'https://')):
        # External links are not fetched; only links back into the project's
        # own hosts are flagged, unless explicitly whitelisted.
        if link in _ALLOWED_EXTERNAL_LINKS:
            return None
        if any(fragment in link for fragment in _ALLOWED_EXTERNAL_FRAGMENTS):
            return None
        pointsHome = 'svn.apache.org' in link or 'lucene.apache.org' in link
        if pointsHome and os.path.basename(fullPath) != 'Changes.html':
            return ' BAD EXTERNAL LINK: %s' % link
        return None

    if link.startswith('mailto:'):
        if '@lucene.apache.org' not in link and '@apache.org' in link:
            return ' BROKEN MAILTO (?): %s' % link
        return None

    if link.startswith('javascript:'):
        # ok...?
        return None

    if 'org/apache/solr/client/solrj/beans/Field.html' in link:
        # see LUCENE-4011: this is a javadocs bug for constants
        # on annotations it seems?
        return None

    if link.startswith('file:'):
        if link not in allFiles:
            # Not a parsed HTML page: accept it if the target exists on disk,
            # either as an absolute path or relative to the working directory.
            filepath = urlparse.unquote(urlparse.urlparse(link).path)
            if os.path.exists(filepath) or os.path.exists(filepath[1:]):
                return None
            return ' BROKEN LINK: %s' % link
        if anchor is not None and anchor not in allFiles[link][1]:
            return ' BROKEN ANCHOR: %s' % origLink
        return None

    return ' BROKEN URL SCHEME: %s' % origLink


def checkAll(dirName):
    """
    Checks *.html (recursively) under this directory.

    ``dirName`` may also name a single HTML file, in which case only that file
    is parsed.  Every page is parsed first, then each collected link is
    verified; problems are printed grouped by the page that contains them.

    Returns the module-level ``failures`` flag, which is True when any page
    failed to parse or any broken link was reported.
    """
    global failures

    # Find/parse all HTML files first
    print()
    print('Crawl/parse...')
    allFiles = {}

    if os.path.isfile(dirName):
        root, fileName = os.path.split(dirName)
        walker = ((root, [], [fileName]),)
    else:
        walker = os.walk(dirName)

    for root, _dirs, files in walker:
        for f in files:
            main, ext = os.path.splitext(f)
            if ext.lower() not in ('.htm', '.html'):
                continue
            if f.startswith('.#'):
                continue
            # maybe?: also skip 'serialized-form'
            if main in ('deprecated-list',):
                # Somehow even w/ java 7 generated javadocs,
                # deprecated-list.html can fail to escape generics types
                continue

            # os.path.join copes with an empty root (bare file name argument),
            # where plain '%s/%s' formatting would produce '/name.html'.
            localPath = os.path.join(root, f)
            fullPath = 'file:%s' % urlparse.quote(localPath.replace(os.path.sep, '/'))
            # parse and unparse the URL to "normalize" it
            fullPath = urlparse.urlunparse(urlparse.urlparse(fullPath))
            with open(localPath, encoding='UTF-8') as htmlFile:
                allFiles[fullPath] = parse(fullPath, htmlFile.read())

    # ... then verify:
    print()
    print('Verify...')
    for fullPath, (links, _anchors) in allFiles.items():
        printed = False
        for link in links:
            problem = _findLinkProblem(fullPath, link, allFiles)
            if problem is None:
                continue
            if not printed:
                # Print the page URL once, ahead of its first problem.
                printed = True
                print()
                print(fullPath)
            print(problem)
        failures = failures or printed

    return failures
# Script entry point: check the directory (or single HTML file) named by the
# first command-line argument; exit status 1 signals that problems were found.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        print('Usage: python %s <directory-or-html-file>' % sys.argv[0])
        sys.exit(1)
    if checkAll(sys.argv[1]):
        print()
        print('Broken javadocs links were found!')
        sys.exit(1)
    sys.exit(0)