Use Python 3.2 for javadocs link checking (its built-in HTML parser is stricter than Python 2.7's)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1356797 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-07-03 15:50:14 +00:00
parent 0f808c6bcd
commit dae7f3e56a
2 changed files with 27 additions and 26 deletions

View File

@ -17,8 +17,8 @@ import traceback
import os import os
import sys import sys
import re import re
from HTMLParser import HTMLParser, HTMLParseError from html.parser import HTMLParser, HTMLParseError
import urlparse import urllib.parse as urlparse
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I) reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I) reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
@ -57,7 +57,7 @@ class FindHyperlinks(HTMLParser):
pass pass
else: else:
self.printFile() self.printFile()
print ' WARNING: anchor "%s" appears more than once' % name print(' WARNING: anchor "%s" appears more than once' % name)
else: else:
self.anchors.add(name) self.anchors.add(name)
elif href is not None: elif href is not None:
@ -73,8 +73,8 @@ class FindHyperlinks(HTMLParser):
def printFile(self): def printFile(self):
if not self.printed: if not self.printed:
print print()
print ' ' + self.baseURL print(' ' + self.baseURL)
self.printed = True self.printed = True
def parse(baseURL, html): def parse(baseURL, html):
@ -85,8 +85,8 @@ def parse(baseURL, html):
parser.close() parser.close()
except HTMLParseError: except HTMLParseError:
parser.printFile() parser.printFile()
print ' WARNING: failed to parse %s:' % baseURL print(' WARNING: failed to parse %s:' % baseURL)
traceback.print_exc() traceback.print_exc(file=sys.stdout)
failures = True failures = True
return [], [] return [], []
@ -104,8 +104,8 @@ def checkAll(dirName):
global failures global failures
# Find/parse all HTML files first # Find/parse all HTML files first
print print()
print 'Crawl/parse...' print('Crawl/parse...')
allFiles = {} allFiles = {}
if os.path.isfile(dirName): if os.path.isfile(dirName):
@ -131,8 +131,8 @@ def checkAll(dirName):
allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read()) allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())
# ... then verify: # ... then verify:
print print()
print 'Verify...' print('Verify...')
for fullPath, (links, anchors) in allFiles.items(): for fullPath, (links, anchors) in allFiles.items():
#print fullPath #print fullPath
printed = False printed = False
@ -176,16 +176,16 @@ def checkAll(dirName):
and os.path.basename(fullPath) != 'Changes.html': and os.path.basename(fullPath) != 'Changes.html':
if not printed: if not printed:
printed = True printed = True
print print()
print fullPath print(fullPath)
print ' BAD EXTERNAL LINK: %s' % link print(' BAD EXTERNAL LINK: %s' % link)
elif link.startswith('mailto:'): elif link.startswith('mailto:'):
if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1: if link.find('@lucene.apache.org') == -1 and link.find('@apache.org') != -1:
if not printed: if not printed:
printed = True printed = True
print print()
print fullPath print(fullPath)
print ' BROKEN MAILTO (?): %s' % link print(' BROKEN MAILTO (?): %s' % link)
elif link.startswith('javascript:'): elif link.startswith('javascript:'):
# ok...? # ok...?
pass pass
@ -200,15 +200,15 @@ def checkAll(dirName):
if not os.path.exists(link): if not os.path.exists(link):
if not printed: if not printed:
printed = True printed = True
print print()
print fullPath print(fullPath)
print ' BROKEN LINK: %s' % link print(' BROKEN LINK: %s' % link)
elif anchor is not None and anchor not in allFiles[link][1]: elif anchor is not None and anchor not in allFiles[link][1]:
if not printed: if not printed:
printed = True printed = True
print print()
print fullPath print(fullPath)
print ' BROKEN ANCHOR: %s' % origLink print(' BROKEN ANCHOR: %s' % origLink)
failures = failures or printed failures = failures or printed
@ -216,8 +216,8 @@ def checkAll(dirName):
if __name__ == '__main__': if __name__ == '__main__':
if checkAll(sys.argv[1]): if checkAll(sys.argv[1]):
print print()
print 'Broken javadocs links were found!' print('Broken javadocs links were found!')
sys.exit(1) sys.exit(1)
sys.exit(0) sys.exit(0)

View File

@ -200,6 +200,7 @@
<property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" /> <property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" />
<property name="moman.rev" value="120" /> <property name="moman.rev" value="120" />
<property name="python.exe" value="python" /> <property name="python.exe" value="python" />
<property name="python32.exe" value="python3.2" />
<property name="gpg.exe" value="gpg" /> <property name="gpg.exe" value="gpg" />
<property name="gpg.key" value="CODE SIGNING KEY" /> <property name="gpg.key" value="CODE SIGNING KEY" />
@ -1638,7 +1639,7 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites
<macrodef name="check-broken-links"> <macrodef name="check-broken-links">
<attribute name="dir"/> <attribute name="dir"/>
<sequential> <sequential>
<exec dir="." executable="${python.exe}" failonerror="true"> <exec dir="." executable="${python32.exe}" failonerror="true">
<arg value="${dev-tools.dir}/scripts/checkJavadocLinks.py"/> <arg value="${dev-tools.dir}/scripts/checkJavadocLinks.py"/>
<arg value="@{dir}"/> <arg value="@{dir}"/>
</exec> </exec>