lucene/dev-tools/scripts/checkJavadocLinks.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import traceback
import os
import sys
import re
from HTMLParser import HTMLParser, HTMLParseError
import urlparse

# Matches an opening <a ...> tag, case-insensitively.  Group 1 is the raw
# attribute text between "<a" and the first ">".
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)

# Matches a run of one or more name=value attributes.  A value may be
# double-quoted, single-quoted or unquoted.  Because the capturing groups sit
# inside a repeated group, groups 1 and 2 hold the name and value of the LAST
# attribute in the run.
# The single-quoted branch uses '*' (any length), mirroring the double-quoted
# branch; with '?' a value such as 'abc.html' could never match.
# NOTE(review): neither pattern is referenced in the visible part of this file.
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']*'|[^'"\s]+))+""", re.I)

# silly emacs: '

class FindHyperlinks(HTMLParser):
  """
  HTML parser that records the anchors and hyperlinks of one document.

  Feed it the text of an HTML file; afterwards ``anchors`` is the set of
  ``<a name="...">`` values that were seen and ``links`` is the list of
  ``<a href="...">`` targets, each resolved against ``baseURL`` with
  urlparse.urljoin.  Duplicate anchors are reported on stdout as they are
  encountered.
  """

  # Anchor names whose repetition is tolerated in serialized-form.html.
  # Seems like a bug in Javadoc generation... you can't have the same
  # anchor name more than once...
  SERIALIZED_FORM_ANCHORS = ('serializedForm',
                             'serialized_methods',
                             'readObject(java.io.ObjectInputStream)',
                             'writeObject(java.io.ObjectOutputStream)')

  def __init__(self, baseURL):
    """
    baseURL is the location of the document being parsed; relative hrefs
    are resolved against it and it is printed as the header of any warning.
    """
    HTMLParser.__init__(self)
    self.anchors = set()
    self.links = []
    self.baseURL = baseURL
    # True once the baseURL header has been printed (see printFile).
    self.printed = False

  def handle_starttag(self, tag, attrs):
    """
    Parser callback for every start tag; only <a> tags are of interest.

    attrs is the sequence of (name, value) pairs of the tag.  A ``name``
    attribute is recorded as an anchor and an ``href`` attribute as a link.
    Raises RuntimeError for an <a> tag that has neither, except in
    AttributeSource.html (see below).
    """
    if tag != 'a':
      return

    name = None
    href = None
    for attName, attValue in attrs:
      if attName == 'name':
        name = attValue
      elif attName == 'href':
        href = attValue

    if name is None and href is None:
      if self.baseURL.endswith('/AttributeSource.html'):
        # LUCENE-4010: AttributeSource's javadocs has an unescaped <A> generics!!  Seems to be a javadocs bug... (fixed in Java 7)
        return
      # Wrap attrs in a tuple so the message formats correctly whatever
      # sequence type the parser hands over.
      raise RuntimeError('BUG: %s' % (attrs,))

    # An <a> tag may carry both name and href; record each one instead of
    # asserting that they are mutually exclusive (an assert would abort the
    # whole run, or silently drop the href under -O).
    if name is not None:
      self._addAnchor(name)
    if href is not None:
      self.links.append(urlparse.urljoin(self.baseURL, href))

  def _addAnchor(self, name):
    """Records an anchor name, warning on stdout if it was already seen."""
    if name not in self.anchors:
      self.anchors.add(name)
    elif name in self.SERIALIZED_FORM_ANCHORS and \
         self.baseURL.endswith('/serialized-form.html'):
      # Known, tolerated duplicate.
      pass
    else:
      self.printFile()
      print('    WARNING: anchor "%s" appears more than once' % name)

  def printFile(self):
    """Prints the baseURL header once, before the first warning for it."""
    if not self.printed:
      print('')
      print('  ' + self.baseURL)
      self.printed = True
                   
def parse(baseURL, html):
  """
  Extracts the hyperlinks and anchors from one HTML document.

  baseURL is the location of the document (relative links are resolved
  against it) and html is its text.  Returns a (links, anchors) pair taken
  from the FindHyperlinks parser.

  If the parser raises HTMLParseError, a warning and the traceback are
  printed, the module-level ``failures`` flag is set and a pair of empty
  lists is returned.
  """
  global failures
  finder = FindHyperlinks(baseURL)
  try:
    finder.feed(html)
    finder.close()
  except HTMLParseError:
    finder.printFile()
    print('  WARNING: failed to parse:')
    traceback.print_exc()
    failures = True
    found = ([], [])
  else:
    found = (finder.links, finder.anchors)
  return found

# Module-wide flag: set to True by parse() when a file cannot be parsed and by
# checkAll() when any problem is reported; checkAll() turns it into the
# process exit status.
failures = False

def _checkLink(link, allFiles):
  """
  Classifies one resolved link; returns the message to print if it is a
  problem, else None.

  allFiles maps the path of every parsed HTML file to its (links, anchors)
  pair.
  """
  origLink = link

  # TODO: use urlparse?
  # Split off the fragment (kept, to verify the anchor) ...
  idx = link.find('#')
  if idx != -1:
    anchor = link[idx+1:]
    link = link[:idx]
  else:
    anchor = None

  # ... and the query string (discarded).
  idx = link.find('?')
  if idx != -1:
    link = link[:idx]

  # TODO: normalize path sep for windows...
  if link.startswith(('http://', 'https://')):
    # External links are not fetched; only known-bad hosts are flagged.
    if 'lucene.apache.org/java/docs/mailinglists.html' in link:
      # OK
      return None
    if 'lucene.apache.org/java/docs/discussion.html' in link:
      # OK
      return None
    if 'svn.apache.org' in link or 'lucene.apache.org' in link:
      return '  BAD EXTERNAL LINK: %s' % link
    return None

  if link.startswith('mailto:'):
    if '@lucene.apache.org' not in link and '@apache.org' in link:
      return '  BROKEN MAILTO (?): %s' % link
    return None

  if link.startswith('javascript:'):
    # ok...?
    return None

  if link not in allFiles:
    # We only load HTML... so if the link is another resource (eg
    # SweetSpotSimilarity refs
    # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:
    if not os.path.exists(link):
      return '  BROKEN LINK: %s' % link
    return None

  if anchor is not None and anchor not in allFiles[link][1]:
    return '  BROKEN ANCHOR: %s' % origLink
  return None

def checkAll(dirName):
  """
  Checks *.html (recursively) under this directory.

  dirName may also name a single file, in which case only that file is
  parsed.  Every problem found is printed under the path of the file that
  contains the offending link.

  This function does not return: it always calls sys.exit, with status 1 if
  any problem was reported (here or by parse) and 0 otherwise.
  """

  global failures

  # Find/parse all HTML files first
  print('')
  print('Crawl/parse...')
  allFiles = {}

  if os.path.isfile(dirName):
    # Single file: present it in the same (root, dirs, files) shape that
    # os.walk yields.
    root, fileName = os.path.split(dirName)
    walker = ((root, [], [fileName]),)
  else:
    walker = os.walk(dirName)

  for root, dirs, files in walker:
    for f in files:
      main, ext = os.path.splitext(f)
      ext = ext.lower()

      # maybe?:
      # and main not in ('serialized-form'):
      if ext in ('.htm', '.html') and \
         not f.startswith('.#') and \
         main not in ('deprecated-list',):
        # Somehow even w/ java 7 generated javadocs,
        # deprecated-list.html can fail to escape generics types
        fullPath = os.path.join(root, f)
        # Open the same path that is used as the key: formatting
        # '%s/%s' % (root, f) turns an empty root (bare file name argument)
        # into an absolute '/name'.  'with' closes the file promptly.
        with open(fullPath) as htmlFile:
          html = htmlFile.read()
        allFiles[fullPath] = parse(fullPath, html)

  # ... then verify:
  print('')
  print('Verify...')
  for fullPath, (links, _) in allFiles.items():
    problems = []
    for link in links:
      message = _checkLink(link, allFiles)
      if message is not None:
        problems.append(message)

    if problems:
      # Path header once, then every problem in the order it was found.
      print('')
      print(fullPath)
      for message in problems:
        print(message)
      failures = True

  sys.exit(1 if failures else 0)
        
if __name__ == '__main__':
  checkAll(sys.argv[1])
improve broken link checker; fix some broken links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328962 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 16:08:53 -04:00			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00			`import traceback`
			`import os`
			`import sys`
			`import re`
			`from HTMLParser import HTMLParser, HTMLParseError`
			`import urlparse`

			`reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)`
			`reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)`

			`# silly emacs: '`

			`class FindHyperlinks(HTMLParser):`

			`def __init__(self, baseURL):`
			`HTMLParser.__init__(self)`
			`self.anchors = set()`
			`self.links = []`
			`self.baseURL = baseURL`
			`self.printed = False`

			`def handle_starttag(self, tag, attrs):`
			`if tag == 'a':`
			`name = None`
			`href = None`
			`for attName, attValue in attrs:`
			`if attName == 'name':`
			`name = attValue`
			`elif attName == 'href':`
			`href = attValue`

			`if name is not None:`
			`assert href is None`
			`if name in self.anchors:`
			`if name in ('serializedForm',`
			`'serialized_methods',`
			`'readObject(java.io.ObjectInputStream)',`
			`'writeObject(java.io.ObjectOutputStream)') \`
			`and self.baseURL.endswith('/serialized-form.html'):`
			`# Seems like a bug in Javadoc generation... you can't have`
			`# same anchor name more than once...`
			`pass`
			`else:`
			`self.printFile()`
			`print ' WARNING: anchor "%s" appears more than once' % name`
			`else:`
			`self.anchors.add(name)`
			`elif href is not None:`
			`assert name is None`
			`self.links.append(urlparse.urljoin(self.baseURL, href))`
			`else:`
			`if self.baseURL.endswith('/AttributeSource.html'):`
			`# LUCENE-4010: AttributeSource's javadocs has an unescaped <A> generics!! Seems to be a javadocs bug... (fixed in Java 7)`
			`pass`
			`else:`
			`raise RuntimeError('BUG: %s' % attrs)`

			`def printFile(self):`
			`if not self.printed:`
			`print`
			`print ' ' + self.baseURL`
			`self.printed = True`

			`def parse(baseURL, html):`
improve broken link checker; fix some broken links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328962 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 16:08:53 -04:00			`global failures`
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00			`parser = FindHyperlinks(baseURL)`
			`try:`
			`parser.feed(html)`
			`parser.close()`
			`except HTMLParseError:`
			`parser.printFile()`
			`print ' WARNING: failed to parse:'`
			`traceback.print_exc()`
improve broken link checker; fix some broken links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328962 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 16:08:53 -04:00			`failures = True`
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00			`return [], []`

			`#print ' %d links, %d anchors' % \`
			`# (len(parser.links), len(parser.anchors))`
			`return parser.links, parser.anchors`

improve broken link checker; fix some broken links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328962 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 16:08:53 -04:00			`failures = False`

add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00			`def checkAll(dirName):`
			`"""`
			`Checks *.html (recursively) under this directory.`
			`"""`

improve broken link checker; fix some broken links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328962 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 16:08:53 -04:00			`global failures`

add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00			`# Find/parse all HTML files first`
			`print`
			`print 'Crawl/parse...'`
			`allFiles = {}`

			`if os.path.isfile(dirName):`
			`root, fileName = os.path.split(dirName)`
			`iter = ((root, [], [fileName]),)`
			`else:`
			`iter = os.walk(dirName)`

			`for root, dirs, files in iter:`
			`for f in files:`
			`main, ext = os.path.splitext(f)`
			`ext = ext.lower()`

			`# maybe?:`
			`# and main not in ('serialized-form'):`
			`if ext in ('.htm', '.html') and \`
			`not f.startswith('.#') and \`
			`main not in ('deprecated-list',):`
			`# Somehow even w/ java 7 generaged javadocs,`
			`# deprecated-list.html can fail to escape generics types`
			`fullPath = os.path.join(root, f)`
			`#print ' %s' % fullPath`
			`allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())`

			`# ... then verify:`
			`print`
			`print 'Verify...'`
			`for fullPath, (links, anchors) in allFiles.items():`
			`#print fullPath`
			`printed = False`
			`for link in links:`

			`origLink = link`

			`# TODO: use urlparse?`
			`idx = link.find('#')`
			`if idx != -1:`
			`anchor = link[idx+1:]`
			`link = link[:idx]`
			`else:`
			`anchor = None`

			`idx = link.find('?')`
			`if idx != -1:`
			`link = link[:idx]`

			`# TODO: normalize path sep for windows...`
			`if link.startswith('http://') or link.startswith('https://'):`
			`# don't check external links`
fix some more broken links... git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328975 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 17:03:06 -04:00
			`if link.find('lucene.apache.org/java/docs/mailinglists.html') != -1:`
			`# OK`
			`pass`
			`elif link.find('lucene.apache.org/java/docs/discussion.html') != -1:`
			`# OK`
			`pass`
			`elif link.find('svn.apache.org') != -1 or link.find('lucene.apache.org') != -1:`
			`if not printed:`
			`printed = True`
			`print`
			`print fullPath`
			`print ' BAD EXTERNAL LINK: %s' % link`
fix mailto and pass javascript links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328988 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 18:53:20 -04:00			`elif link.startswith('mailto:'):`
fix some more broken links... git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328975 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 17:03:06 -04:00			`if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1:`
			`if not printed:`
			`printed = True`
			`print`
			`print fullPath`
			`print ' BROKEN MAILTO (?): %s' % link`
fix mailto and pass javascript links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328988 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 18:53:20 -04:00			`elif link.startswith('javascript:'):`
			`# ok...?`
			`pass`
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00			`elif link not in allFiles:`
			`# We only load HTML... so if the link is another resource (eg`
			`# SweetSpotSimilarity refs`
			`# lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:`
			`if not os.path.exists(link):`
			`if not printed:`
			`printed = True`
			`print`
			`print fullPath`
			`print ' BROKEN LINK: %s' % link`
			`elif anchor is not None and anchor not in allFiles[link][1]:`
			`if not printed:`
			`printed = True`
			`print`
			`print fullPath`
			`print ' BROKEN ANCHOR: %s' % origLink`
improve broken link checker; fix some broken links git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328962 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 16:08:53 -04:00
			`failures = failures or printed`

			`if failures:`
			`sys.exit(1)`
			`else:`
			`sys.exit(0)`
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68 2012-04-22 15:41:57 -04:00
			`if __name__ == '__main__':`
			`checkAll(sys.argv[1])`