2012-04-22 16:08:53 -04:00
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
2012-04-22 15:41:57 -04:00
|
|
|
import traceback
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import re
|
2012-07-03 11:50:14 -04:00
|
|
|
from html.parser import HTMLParser, HTMLParseError
|
|
|
|
import urllib.parse as urlparse
|
2012-04-22 15:41:57 -04:00
|
|
|
|
|
|
|
# Matches an opening <a ...> tag (case-insensitive); group 1 is the raw
# attribute text, including its leading whitespace.
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)

# Matches one or more name=value attributes.  The value may be double-quoted,
# single-quoted or unquoted.  The single-quoted branch must accept any number
# of characters, exactly like the double-quoted one (it used to accept at
# most one character, so href='abc' never matched).
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']*'|[^'"\s]+))+""", re.I)

# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
# Matches only when the whole string is free of the control characters
# U+0000-U+0008, U+000B-U+000C, U+000E-U+001F and of U+FFFE / U+FFFF.
reValidChar = re.compile("^[^\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE\uFFFF]*$")
|
2012-09-06 14:37:38 -04:00
|
|
|
|
2012-04-22 15:41:57 -04:00
|
|
|
# silly emacs: '
|
|
|
|
|
|
|
|
class FindHyperlinks(HTMLParser):
    """HTML parser that records the anchors and hyperlinks of one page.

    While the page is fed in, every ``<a name=...>`` is added to
    ``self.anchors`` and every ``<a href=...>`` is resolved against
    ``baseURL`` and appended to ``self.links``.  Tags other than the ones in
    ``_UNTRACKED_TAGS`` are additionally pushed on ``self.stack`` so that a
    mismatched closing tag is reported as a RuntimeError.
    """

    # Tags that are never pushed on / popped from the nesting stack.
    # NOTE: I don't think 'a' should be in here. But try debugging
    # NumericRangeQuery.html. (Could be javadocs bug, it's a generic type...)
    _UNTRACKED_TAGS = ('link', 'meta', 'frame', 'br', 'hr', 'p', 'li', 'img', 'col', 'a')

    # Anchor names tolerated more than once in serialized-form.html.
    _DUPLICATE_OK_ANCHORS = ('serializedForm',
                             'serialized_methods',
                             'readObject(java.io.ObjectInputStream)',
                             'writeObject(java.io.ObjectOutputStream)')

    def __init__(self, baseURL):
        """Create a parser for the page located at baseURL."""
        HTMLParser.__init__(self)
        self.stack = []         # currently open tracked tags, innermost last
        self.anchors = set()    # values of <a name=...> seen so far
        self.links = []         # <a href=...> targets, joined with baseURL
        self.baseURL = baseURL
        self.printed = False    # whether printFile() already printed baseURL

    def handle_starttag(self, tag, attrs):
        """Track nesting of tag and, for <a>, record its name or href.

        Raises RuntimeError when an anchor name is repeated (outside the
        tolerated serialized-form.html cases) or when an <a> carries neither
        a name nor an href (except in AttributeSource.html).
        """
        if tag not in self._UNTRACKED_TAGS:
            self.stack.append(tag)

        if tag != 'a':
            return

        name = None
        href = None
        for attName, attValue in attrs:
            if attName == 'name':
                name = attValue
            elif attName == 'href':
                href = attValue

        if name is not None:
            assert href is None
            if name not in self.anchors:
                self.anchors.add(name)
            elif name in self._DUPLICATE_OK_ANCHORS \
                    and self.baseURL.endswith('/serialized-form.html'):
                # Seems like a bug in Javadoc generation... you can't have
                # same anchor name more than once...
                pass
            else:
                self.printFile()
                raise RuntimeError('anchor "%s" appears more than once' % name)
        elif href is not None:
            self.links.append(urlparse.urljoin(self.baseURL, href.strip()))
        elif self.baseURL.endswith('/AttributeSource.html'):
            # LUCENE-4010: AttributeSource's javadocs has an unescaped <A> generics!! Seems to be a javadocs bug... (fixed in Java 7)
            pass
        else:
            raise RuntimeError('couldn\'t find an href nor name in link in %s: only got these attrs: %s' % (self.baseURL, attrs))

    def handle_endtag(self, tag):
        """Pop tag from the nesting stack, raising RuntimeError on mismatch."""
        if tag in self._UNTRACKED_TAGS:
            return

        line, column = self.getpos()

        if not self.stack:
            # Nothing is open, so there is no "expected" tag to report; name
            # the stray closing tag itself rather than indexing an empty list.
            raise RuntimeError('%s %s:%s: saw </%s> no opening <%s>' % (self.baseURL, line, column, tag, tag))

        if self.stack[-1] != tag:
            raise RuntimeError('%s %s:%s: saw </%s> but expected </%s>' % (self.baseURL, line, column, tag, self.stack[-1]))

        self.stack.pop()

    def printFile(self):
        """Print baseURL (after a blank line) the first time this is called."""
        if not self.printed:
            print()
            print(' ' + self.baseURL)
            self.printed = True
|
|
|
|
|
|
|
|
def parse(baseURL, html):
    """Extract the hyperlinks and anchors of one HTML page.

    baseURL is the URL of the page and html its text.  Returns the pair
    (links, anchors) gathered by FindHyperlinks.  When the text contains
    characters rejected by reValidChar, or the parser raises HTMLParseError,
    a warning is printed, the module-level ``failures`` flag is set and a
    pair of empty lists is returned instead.
    """
    global failures

    # look for broken unicode
    if reValidChar.match(html) is None:
        print(' WARNING: invalid characters detected in: %s' % baseURL)
        failures = True
        return [], []

    finder = FindHyperlinks(baseURL)
    try:
        finder.feed(html)
        finder.close()
    except HTMLParseError:
        finder.printFile()
        print(' WARNING: failed to parse %s:' % baseURL)
        traceback.print_exc(file=sys.stdout)
        failures = True
        return [], []
    else:
        return finder.links, finder.anchors
|
|
|
|
|
2012-04-22 16:08:53 -04:00
|
|
|
# Module-wide result flag.  parse() sets it to True when a page contains
# invalid characters or fails to parse; checkAll() sets it when a page has a
# reported link problem and returns its final value.
failures = False
|
|
|
|
|
2012-04-22 15:41:57 -04:00
|
|
|
# External pages (matched as substrings of the link) that are always accepted.
_OK_EXTERNAL_PAGES = (
    'lucene.apache.org/java/docs/mailinglists.html',
    'lucene.apache.org/java/docs/discussion.html',
    'lucene.apache.org/core/discussion.html',
    'lucene.apache.org/solr/mirrors-solr-latest-redir.html',
    'lucene.apache.org/solr/discussion.html',
    'lucene.apache.org/solr/features.html',
)

# External links that are accepted only when they match exactly.
_OK_EXTERNAL_URLS = (
    'http://lucene.apache.org/core/',
    'http://lucene.apache.org/solr/',
)


def _findLinkProblem(origLink, fullPath, allFiles):
    """Return the message to report for one link, or None if it is fine.

    origLink is a link collected from the page whose file: URL is fullPath;
    allFiles maps the file: URL of every crawled page to its
    (links, anchors) pair.
    """
    # TODO: use urlparse?
    link, hashSep, anchor = origLink.partition('#')
    if not hashSep:
        anchor = None

    # remove any whitespace from the middle of the link
    link = ''.join(link.split())

    # ignore any query string
    link = link.partition('?')[0]

    # TODO: normalize path sep for windows...
    if link.startswith(('http://', 'https://')):
        # don't check external links, except that links into the project's
        # own sites must be one of the known-good pages (Changes.html is
        # exempt from this restriction)
        if link in _OK_EXTERNAL_URLS or any(page in link for page in _OK_EXTERNAL_PAGES):
            return None
        if ('svn.apache.org' in link or 'lucene.apache.org' in link) \
                and os.path.basename(fullPath) != 'Changes.html':
            return ' BAD EXTERNAL LINK: %s' % link
        return None

    if link.startswith('mailto:'):
        if '@lucene.apache.org' not in link and '@apache.org' in link:
            return ' BROKEN MAILTO (?): %s' % link
        return None

    if link.startswith('javascript:'):
        # ok...?
        return None

    if 'org/apache/solr/client/solrj/beans/Field.html' in link:
        # see LUCENE-4011: this is a javadocs bug for constants
        # on annotations it seems?
        return None

    if link.startswith('file:'):
        if link not in allFiles:
            # Not one of the crawled pages: accept it if the file exists on
            # disk, trying the path both with and without its first character.
            filepath = urlparse.unquote(urlparse.urlparse(link).path)
            if not (os.path.exists(filepath) or os.path.exists(filepath[1:])):
                return ' BROKEN LINK: %s' % link
        elif anchor is not None and anchor not in allFiles[link][1]:
            return ' BROKEN ANCHOR: %s' % origLink
        return None

    return ' BROKEN URL SCHEME: %s' % origLink


def checkAll(dirName):
    """
    Checks *.html (recursively) under this directory.

    dirName may also name a single file, in which case only that file is
    checked.  Every *.htm / *.html file is parsed first (names starting with
    '.#' and deprecated-list.* are skipped), then each collected link is
    verified and problems are printed grouped by page.

    Returns the module-level ``failures`` flag, which is True when a problem
    was reported here or had already been recorded by parse().
    """
    global failures

    # Find/parse all HTML files first
    print()
    print('Crawl/parse...')
    allFiles = {}

    if os.path.isfile(dirName):
        root, fileName = os.path.split(dirName)
        walker = ((root, [], [fileName]),)
    else:
        walker = os.walk(dirName)

    for root, _dirs, files in walker:
        for f in files:
            main, ext = os.path.splitext(f)

            # maybe?:
            # and main not in ('serialized-form'):
            # Somehow even w/ java 7 generated javadocs,
            # deprecated-list.html can fail to escape generics types
            if ext.lower() not in ('.htm', '.html') \
                    or f.startswith('.#') \
                    or main in ('deprecated-list',):
                continue

            # os.path.join (rather than '%s/%s') keeps a bare file name
            # relative when root is empty instead of turning it into '/name'.
            localPath = os.path.join(root, f)
            fullPath = 'file:%s' % urlparse.quote(localPath.replace(os.path.sep, '/'))
            # parse and unparse the URL to "normalize" it
            fullPath = urlparse.urlunparse(urlparse.urlparse(fullPath))

            with open(localPath, encoding='UTF-8') as htmlFile:
                html = htmlFile.read()
            allFiles[fullPath] = parse(fullPath, html)

    # ... then verify:
    print()
    print('Verify...')
    for fullPath, (links, _anchors) in allFiles.items():
        problems = []
        for link in links:
            problem = _findLinkProblem(link, fullPath, allFiles)
            if problem is not None:
                problems.append(problem)

        if problems:
            # Page header once, then every problem in link order.
            print()
            print(fullPath)
            for problem in problems:
                print(problem)
            failures = True

    return failures
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Expect the file or directory to check as the first argument; report a
    # usage error instead of failing with an IndexError traceback.
    if len(sys.argv) < 2:
        print('usage: %s <html-file-or-directory>' % sys.argv[0], file=sys.stderr)
        sys.exit(2)

    # Exit status 1 signals that at least one problem was reported.
    if checkAll(sys.argv[1]):
        print()
        print('Broken javadocs links were found!')
        sys.exit(1)
    sys.exit(0)
|
2012-04-22 15:41:57 -04:00
|
|
|
|