# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import traceback import os import sys import re from html.parser import HTMLParser, HTMLParseError import urllib.parse as urlparse reHyperlink = re.compile(r'', re.I) reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I) # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ reValidChar = re.compile("^[^\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE\uFFFF]*$") # silly emacs: ' class FindHyperlinks(HTMLParser): def __init__(self, baseURL): HTMLParser.__init__(self) self.stack = [] self.anchors = set() self.links = [] self.baseURL = baseURL self.printed = False def handle_starttag(self, tag, attrs): # NOTE: I don't think 'a' should be in here. But try debugging # NumericRangeQuery.html. (Could be javadocs bug, its a generic type...) if tag not in ('link', 'meta', 'frame', 'br', 'hr', 'p', 'li', 'img', 'col', 'a'): self.stack.append(tag) if tag == 'a': name = None href = None for attName, attValue in attrs: if attName == 'name': name = attValue elif attName == 'href': href = attValue if name is not None: assert href is None if name in self.anchors: if name in ('serializedForm', 'serialized_methods', 'readObject(java.io.ObjectInputStream)', 'writeObject(java.io.ObjectOutputStream)') \ and self.baseURL.endswith('/serialized-form.html'): # Seems like a bug in Javadoc generation... you can't have # same anchor name more than once... pass else: self.printFile() print(' WARNING: anchor "%s" appears more than once' % name) else: self.anchors.add(name) elif href is not None: assert name is None href = href.strip() self.links.append(urlparse.urljoin(self.baseURL, href)) else: if self.baseURL.endswith('/AttributeSource.html'): # LUCENE-4010: AttributeSource's javadocs has an unescaped generics!! Seems to be a javadocs bug... (fixed in Java 7) pass else: raise RuntimeError('couldn\'t find an href nor name in link in %s: only got these attrs: %s' % (self.baseURL, attrs)) def handle_endtag(self, tag): if tag in ('link', 'meta', 'frame', 'br', 'hr', 'p', 'li', 'img', 'col', 'a'): return if len(self.stack) == 0: raise RuntimeError('%s %s:%s: saw no opening <%s>' % (self.baseURL, self.getpos()[0], self.getpos()[1], tag, self.stack[-1])) if self.stack[-1] == tag: self.stack.pop() else: raise RuntimeError('%s %s:%s: saw but expected ' % (self.baseURL, self.getpos()[0], self.getpos()[1], tag, self.stack[-1])) def printFile(self): if not self.printed: print() print(' ' + self.baseURL) self.printed = True def parse(baseURL, html): global failures # look for broken unicode if not reValidChar.match(html): print(' WARNING: invalid characters detected in: %s' % baseURL) failures = True return [], [] parser = FindHyperlinks(baseURL) try: parser.feed(html) parser.close() except HTMLParseError: parser.printFile() print(' WARNING: failed to parse %s:' % baseURL) traceback.print_exc(file=sys.stdout) failures = True return [], [] #print ' %d links, %d anchors' % \ # (len(parser.links), len(parser.anchors)) return parser.links, parser.anchors failures = False def checkAll(dirName): """ Checks *.html (recursively) under this directory. """ global failures # Find/parse all HTML files first print() print('Crawl/parse...') allFiles = {} if os.path.isfile(dirName): root, fileName = os.path.split(dirName) iter = ((root, [], [fileName]),) else: iter = os.walk(dirName) for root, dirs, files in iter: for f in files: main, ext = os.path.splitext(f) ext = ext.lower() # maybe?: # and main not in ('serialized-form'): if ext in ('.htm', '.html') and \ not f.startswith('.#') and \ main not in ('deprecated-list',): # Somehow even w/ java 7 generaged javadocs, # deprecated-list.html can fail to escape generics types fullPath = os.path.join(root, f).replace(os.path.sep,'/') fullPath = 'file:%s' % urlparse.quote(fullPath) # parse and unparse the URL to "normalize" it fullPath = urlparse.urlunparse(urlparse.urlparse(fullPath)) #print ' %s' % fullPath allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f), encoding='UTF-8').read()) # ... then verify: print() print('Verify...') for fullPath, (links, anchors) in allFiles.items(): #print fullPath printed = False for link in links: origLink = link # TODO: use urlparse? idx = link.find('#') if idx != -1: anchor = link[idx+1:] link = link[:idx] else: anchor = None idx = link.find('?') if idx != -1: link = link[:idx] # TODO: normalize path sep for windows... if link.startswith('http://') or link.startswith('https://'): # don't check external links if link.find('lucene.apache.org/java/docs/mailinglists.html') != -1: # OK pass elif link == 'http://lucene.apache.org/core/': # OK pass elif link == 'http://lucene.apache.org/solr/': # OK pass elif link.find('lucene.apache.org/java/docs/discussion.html') != -1: # OK pass elif link.find('lucene.apache.org/core/discussion.html') != -1: # OK pass elif link.find('lucene.apache.org/solr/mirrors-solr-latest-redir.html') != -1: # OK pass elif link.find('lucene.apache.org/solr/discussion.html') != -1: # OK pass elif link.find('lucene.apache.org/solr/features.html') != -1: # OK pass elif (link.find('svn.apache.org') != -1 or link.find('lucene.apache.org') != -1)\ and os.path.basename(fullPath) != 'Changes.html': if not printed: printed = True print() print(fullPath) print(' BAD EXTERNAL LINK: %s' % link) elif link.startswith('mailto:'): if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1: if not printed: printed = True print() print(fullPath) print(' BROKEN MAILTO (?): %s' % link) elif link.startswith('javascript:'): # ok...? pass elif 'org/apache/solr/client/solrj/beans/Field.html' in link: # see LUCENE-4011: this is a javadocs bug for constants # on annotations it seems? pass elif link.startswith('file:'): if link not in allFiles: filepath = urlparse.unquote(urlparse.urlparse(link).path) if not (os.path.exists(filepath) or os.path.exists(filepath[1:])): if not printed: printed = True print() print(fullPath) print(' BROKEN LINK: %s' % link) elif anchor is not None and anchor not in allFiles[link][1]: if not printed: printed = True print() print(fullPath) print(' BROKEN ANCHOR: %s' % origLink) else: if not printed: printed = True print() print(fullPath) print(' BROKEN URL SCHEME: %s' % origLink) failures = failures or printed return failures if __name__ == '__main__': if checkAll(sys.argv[1]): print() print('Broken javadocs links were found!') sys.exit(1) sys.exit(0)