# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import traceback import os import sys import re from HTMLParser import HTMLParser, HTMLParseError import urlparse reHyperlink = re.compile(r'', re.I) reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I) # silly emacs: ' class FindHyperlinks(HTMLParser): def __init__(self, baseURL): HTMLParser.__init__(self) self.anchors = set() self.links = [] self.baseURL = baseURL self.printed = False def handle_starttag(self, tag, attrs): if tag == 'a': name = None href = None for attName, attValue in attrs: if attName == 'name': name = attValue elif attName == 'href': href = attValue if name is not None: assert href is None if name in self.anchors: if name in ('serializedForm', 'serialized_methods', 'readObject(java.io.ObjectInputStream)', 'writeObject(java.io.ObjectOutputStream)') \ and self.baseURL.endswith('/serialized-form.html'): # Seems like a bug in Javadoc generation... you can't have # same anchor name more than once... pass else: self.printFile() print ' WARNING: anchor "%s" appears more than once' % name else: self.anchors.add(name) elif href is not None: assert name is None self.links.append(urlparse.urljoin(self.baseURL, href)) else: if self.baseURL.endswith('/AttributeSource.html'): # LUCENE-4010: AttributeSource's javadocs has an unescaped generics!! Seems to be a javadocs bug... (fixed in Java 7) pass else: raise RuntimeError('BUG: %s' % attrs) def printFile(self): if not self.printed: print print ' ' + self.baseURL self.printed = True def parse(baseURL, html): global failures parser = FindHyperlinks(baseURL) try: parser.feed(html) parser.close() except HTMLParseError: parser.printFile() print ' WARNING: failed to parse:' traceback.print_exc() failures = True return [], [] #print ' %d links, %d anchors' % \ # (len(parser.links), len(parser.anchors)) return parser.links, parser.anchors failures = False def checkAll(dirName): """ Checks *.html (recursively) under this directory. """ global failures # Find/parse all HTML files first print print 'Crawl/parse...' allFiles = {} if os.path.isfile(dirName): root, fileName = os.path.split(dirName) iter = ((root, [], [fileName]),) else: iter = os.walk(dirName) for root, dirs, files in iter: for f in files: main, ext = os.path.splitext(f) ext = ext.lower() # maybe?: # and main not in ('serialized-form'): if ext in ('.htm', '.html') and \ not f.startswith('.#') and \ main not in ('deprecated-list',): # Somehow even w/ java 7 generaged javadocs, # deprecated-list.html can fail to escape generics types fullPath = os.path.join(root, f) #print ' %s' % fullPath allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read()) # ... then verify: print print 'Verify...' for fullPath, (links, anchors) in allFiles.items(): #print fullPath printed = False for link in links: origLink = link # TODO: use urlparse? idx = link.find('#') if idx != -1: anchor = link[idx+1:] link = link[:idx] else: anchor = None idx = link.find('?') if idx != -1: link = link[:idx] # TODO: normalize path sep for windows... if link.startswith('http://') or link.startswith('https://'): # don't check external links if link.find('lucene.apache.org/java/docs/mailinglists.html') != -1: # OK pass elif link.find('lucene.apache.org/java/docs/discussion.html') != -1: # OK pass elif link.find('svn.apache.org') != -1 or link.find('lucene.apache.org') != -1: if not printed: printed = True print print fullPath print ' BAD EXTERNAL LINK: %s' % link elif link.startswith('mailto:'): if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1: if not printed: printed = True print print fullPath print ' BROKEN MAILTO (?): %s' % link elif link.startswith('javascript:'): # ok...? pass elif link not in allFiles: # We only load HTML... so if the link is another resource (eg # SweetSpotSimilarity refs # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK: if not os.path.exists(link): if not printed: printed = True print print fullPath print ' BROKEN LINK: %s' % link elif anchor is not None and anchor not in allFiles[link][1]: if not printed: printed = True print print fullPath print ' BROKEN ANCHOR: %s' % origLink failures = failures or printed if failures: sys.exit(1) else: sys.exit(0) if __name__ == '__main__': checkAll(sys.argv[1])