# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Scan generated javadoc HTML for missing descriptions and broken markup."""

import sys
import os
import re

# NOTE(review): the HTML markup inside the pattern literals below was lost
# from the copy of this file that was reviewed (only the capture groups and
# anchors survived).  The tag/attribute text has been reconstructed to match
# the javadoc table/summary layout these names refer to -- confirm each
# pattern against real javadoc output before relying on it.
reHREF = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE)

reMarkup = re.compile(r'<.*?>')
reDivBlock = re.compile(r'<div class="block">(.*?)</div>', re.IGNORECASE)
reCaption = re.compile(r'<caption><span>(.*?)</span>', re.IGNORECASE)
reJ8Caption = re.compile(r'<h3>(.*?) Summary</h3>')
reTDLastNested = re.compile(
    r'^<td class="colLast"><code><strong><a href="[^>]*\.([^>]*?)\.html" title="class in[^>]*">',
    re.IGNORECASE)
reMethod = re.compile(
    r'^<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="[^>]*#([^>]*?)">',
    re.IGNORECASE)
reColOne = re.compile(
    r'^<th class="colOne" scope="row"><code><span class="memberNameLink"><a href="[^>]*#([^>]*?)">',
    re.IGNORECASE)
reMemberNameLink = re.compile(
    r'^<td class="colLast"><code><span class="memberNameLink"><a href="[^>]*#([^>]*?)"',
    re.IGNORECASE)
reNestedClassMemberNameLink = re.compile(
    r'^<td class="colLast"><code><span class="memberNameLink"><a href="[^>]*?".*?>(.*?)</a>',
    re.IGNORECASE)
reMemberNameOneLink = re.compile(
    r'^<td class="colOne"><code><span class="memberNameLink"><a href="[^>]*#([^>]*?)"',
    re.IGNORECASE)

# The "Method Detail" section at the end of a class page.
reMethodDetail = re.compile(r'^<h3>Method Detail</h3>$', re.IGNORECASE)
# NOTE(review): anchor attribute accepted as either name= or id=; confirm
# which one the javadoc version in use emits.
reMethodDetailAnchor = re.compile(r'^(?:</a>)?<a (?:name|id)="([^>]*?)">$', re.IGNORECASE)

# Matches one opening, closing or self-closing tag; group(1) is the tag name,
# prefixed with '/' for a closing tag.
reTag = re.compile(r"""(?i)<(/?\w+)((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>""")

# Tags that are allowed to appear without a matching close tag.
_UNCLOSED_OK = ('br', 'li', 'p', 'col')


def verifyHTML(s):
    """Check that the tags in the HTML fragment ``s`` are properly nested.

    Tags named in ``_UNCLOSED_OK`` are ignored entirely.  Every other opening
    tag must be closed, in order, by a closing tag of the same name
    (compared case-insensitively).

    Raises:
        RuntimeError: on a closing tag with no opener, a closing tag that
            does not match the innermost open tag, or a tag left open at the
            end of ``s``.
    """
    stack = []  # (lower-cased tag name, full text of the opening tag)
    for m in reTag.finditer(s):
        tag = m.group(1)
        isClose = tag[:1] == '/'
        justTag = tag[1:] if isClose else tag

        if justTag.lower() in _UNCLOSED_OK:
            continue

        if isClose:
            if not stack:
                raise RuntimeError('saw closing "%s" without opening <%s...>' % (m.group(0), justTag))
            if stack[-1][0] != justTag.lower():
                raise RuntimeError('closing "%s" does not match opening "%s"' % (m.group(0), stack[-1][1]))
            stack.pop()
        else:
            stack.append((tag.lower(), m.group(0)))

    if stack:
        raise RuntimeError('"%s" was never closed' % stack[-1][1])


def cleanHTML(s):
    """Strip all markup from ``s``, decode a few entities, and trim whitespace."""
    s = reMarkup.sub('', s)
    s = s.replace('&nbsp;', ' ')
    s = s.replace('&lt;', '<')
    s = s.replace('&gt;', '>')
    # '&amp;' is decoded last so that e.g. '&amp;lt;' is not decoded twice.
    s = s.replace('&amp;', '&')
    return s.strip()


# NOTE(review): markup in these patterns is reconstructed as well; see the
# note at the top of the pattern definitions.
reH3 = re.compile(r'^<h3>(.*?)</h3>', re.IGNORECASE | re.MULTILINE)
reH4 = re.compile(r'^<h4>(.*?)</h4>', re.IGNORECASE | re.MULTILINE)
reDetailsDiv = re.compile(r'<div class="details">')
reEndOfClassData = re.compile(r'<!--.*END OF CLASS DATA.*-->')
reBlockList = re.compile(r'<ul class="blockList(?:Last)?">')
reCloseUl = re.compile(r'</ul>')


def _verifyDetails(desc, cat, item, errors):
    """Run verifyHTML over the collected lines; record a failure in ``errors``."""
    if not desc:
        return
    try:
        verifyHTML(''.join(desc))
    except RuntimeError as err:
        errors.append((cat, item, str(err)))


def checkClassDetails(fullPath):
    """
    Checks for invalid HTML in the full javadocs under each field/method.

    Prints one line per broken item (after a header with ``fullPath``) and
    returns True if anything was broken, False otherwise.
    """
    # TODO: only works with java7 generated javadocs now!
    desc = []
    cat = None
    item = None
    errors = []
    inDetailsDiv = False
    blockListDepth = 0

    with open(fullPath, encoding='UTF-8') as f:
        for line in f:
            # Skip content up until <div class="details">
            if not inDetailsDiv:
                if reDetailsDiv.match(line) is not None:
                    inDetailsDiv = True
                continue

            # Stop looking at content at the closing details </div>, which is
            # just before the END OF CLASS DATA comment
            if reEndOfClassData.match(line) is not None:
                _verifyDetails(desc, cat, item, errors)
                break

            # <ul class="blockList(Last)"> is the boundary between items
            if reBlockList.match(line) is not None:
                blockListDepth += 1
                _verifyDetails(desc, cat, item, errors)
                del desc[:]

            # Only text nested three block lists deep is collected.
            if blockListDepth == 3:
                desc.append(line)

            if reCloseUl.match(line) is not None:
                blockListDepth -= 1
            else:
                m = reH3.search(line)
                if m is not None:
                    cat = m.group(1)
                else:
                    m = reH4.search(line)
                    if m is not None:
                        item = m.group(1)

    if errors:
        print()
        print(fullPath)
        for cat, item, message in errors:
            print(' broken details HTML: %s: %s: %s' % (cat, item, message))
        return True
    return False


def checkClassSummaries(fullPath):
    """Check the summary tables of one class page.

    Reports summary rows that have no description and descriptions whose
    HTML fails verifyHTML.  Returns True if anything was reported.

    Raises:
        RuntimeError: if a row without a description is found but no item
            name could be located for it.
    """
    # TODO: only works with java7 generated javadocs now!
    missing = []
    broken = []
    inThing = False
    hasDesc = False
    lastCaption = None
    lastItem = None
    desc = None
    foundMethodDetail = False
    lastMethodAnchor = None

    itemMatchers = (reTDLastNested,               # nested classes
                    reMethod,                     # methods etc.
                    reColOne,                     # ctors etc.
                    reMemberNameLink,             # java 8
                    reNestedClassMemberNameLink,  # java 8, nested class
                    reMemberNameOneLink)          # java 8 ctors

    with open(fullPath, encoding='UTF-8') as f:
        for lineCount, line in enumerate(f, 1):
            m = reMethodDetail.search(line)
            if m is not None:
                foundMethodDetail = True
                continue

            # prune methods that are just @Overrides of other interface/classes,
            # they should be specified elsewhere, if they are e.g. jdk or
            # external classes we cannot inherit their docs anyway
            if foundMethodDetail:
                m = reMethodDetailAnchor.search(line)
                if m is not None:
                    lastMethodAnchor = m.group(1)
                    continue
                isOverrides = '>Overrides:<' in line or '>Specified by:<' in line
                if isOverrides and ('Methods', lastMethodAnchor) in missing:
                    missing.remove(('Methods', lastMethodAnchor))

            m = reCaption.search(line)
            if m is not None:
                lastCaption = m.group(1)
            else:
                m = reJ8Caption.search(line)
                if m is not None:
                    lastCaption = m.group(1)
                    if not lastCaption.endswith('s'):
                        lastCaption += 's'

            # Try to find the item in question (method/member name):
            for matcher in itemMatchers:
                m = matcher.search(line)
                if m is not None:
                    lastItem = m.group(1)
                    break

            lineLower = line.strip().lower()

            # NOTE(review): the row-start test was lost from the reviewed
            # copy; it must set inThing and reset hasDesc, which the code
            # below reads.  The literal '<tr ...' markers are reconstructed.
            if lineLower.find('<tr class="') != -1 or lineLower.find('<tr id="') != -1:
                inThing = True
                hasDesc = False
                continue

            if not inThing:
                continue

            if lineLower.find('</tr>') != -1:
                if not hasDesc:
                    if lastItem is None:
                        raise RuntimeError('failed to locate javadoc item in %s, line %d? last line: %s' % (fullPath, lineCount, line.rstrip()))
                    missing.append((lastCaption, unEscapeURL(lastItem)))
                inThing = False
                continue

            if line.find('<div class="block">') != -1:
                desc = []
            if desc is not None:
                desc.append(line)
                if line.find('</div>') != -1:
                    desc = ''.join(desc)

                    try:
                        verifyHTML(desc)
                    except RuntimeError as e:
                        broken.append((lastCaption, lastItem, str(e)))

                    desc = desc.replace('<div class="block">', '')
                    desc = desc.replace('</div>', '')
                    desc = desc.strip()
                    hasDesc = len(desc) > 0
                    desc = None

    if missing or broken:
        print()
        print(fullPath)
        for (caption, item) in missing:
            print(' missing %s: %s' % (caption, item))
        for (caption, item, why) in broken:
            print(' broken HTML: %s: %s: %s' % (caption, item, why))
        return True
    return False


def checkSummary(fullPath):
    """Check a package-summary.html / overview-summary.html page.

    Reports a missing package description, table rows with an empty
    description cell, and descriptions that are really the license header.
    Returns True if anything was reported.

    Raises:
        RuntimeError: if no description section was located and the file is
            not an overview-summary.html.
    """
    printed = False
    anyMissing = False
    sawPackage = False
    desc = []
    lastHREF = None

    def report(message):
        # Print the file name once, before the first problem found in it.
        nonlocal printed, anyMissing
        if not printed:
            print()
            print(fullPath)
            printed = True
        print(message)
        anyMissing = True

    with open(fullPath, encoding='UTF-8') as f:
        for line in f:
            lineLower = line.strip().lower()
            if desc is not None:
                # TODO: also detect missing description in overview-summary
                # NOTE(review): the sawPackage branch and the markup in these
                # literals were lost from the reviewed copy and are
                # reconstructed here.
                if lineLower.startswith('package ') or lineLower.startswith('<h1 title="package" '):
                    sawPackage = True
                elif sawPackage:
                    if lineLower.startswith(('<table ', '<b>see: ', '<p>see:', '</main>')):
                        desc = ' '.join(desc)
                        desc = reMarkup.sub(' ', desc)
                        desc = desc.strip()
                        if desc == '':
                            report(' no package description (missing package.html in src?)')
                        desc = None
                    else:
                        desc.append(lineLower)

            if lineLower in ('<td>&nbsp;</td>', '<td></td>', '<td class="collast">&nbsp;</td>'):
                report(' missing description: %s' % unescapeHTML(lastHREF))
            elif (lineLower.find('licensed to the apache software foundation') != -1
                  or lineLower.find('copyright 2004 the apache software foundation') != -1):
                report(' license-is-javadoc: %s' % unescapeHTML(lastHREF))

            m = reHREF.search(line)
            if m is not None:
                lastHREF = m.group(1)

        if desc is not None and fullPath.find('/overview-summary.html') == -1:
            raise RuntimeError('BUG: failed to locate description in %s' % fullPath)

    return anyMissing


def unEscapeURL(s):
    """Decode the few percent-escapes that show up in javadoc anchors."""
    # Not exhaustive!!
    s = s.replace('%20', ' ')
    s = s.replace('%5B', '[')
    s = s.replace('%5D', ']')
    return s


def unescapeHTML(s):
    """Decode the '&lt;', '&gt;' and '&amp;' entities in ``s``."""
    s = s.replace('&lt;', '<')
    s = s.replace('&gt;', '>')
    # '&amp;' last, so an escaped entity is not decoded twice.
    s = s.replace('&amp;', '&')
    return s


def checkPackageSummaries(root, level='class'):
    """
    Just checks for blank summary lines in package-summary.html; returns
    True if there are problems.

    ``level`` must be one of 'none', 'package', 'class' or 'method';
    anything else prints a message and exits with status 1.  If ``root`` is
    not a directory it is checked as a single class page and the process
    exits with status 0.
    """
    if level not in ('class', 'package', 'method', 'none'):
        print('unsupported level: %s, must be "class" or "package" or "method" or "none"' % level)
        sys.exit(1)

    # Disabled on purpose: flip to True to regenerate the javadocs first.
    if False:
        os.chdir(root)
        print()
        print('Run "ant javadocs" > javadocs.log...')
        if os.system('ant javadocs > javadocs.log 2>&1'):
            print(' FAILED')
            sys.exit(1)

    anyMissing = False
    if not os.path.isdir(root):
        # Single-file mode: problems are printed, but the exit status is
        # always 0.
        checkClassSummaries(root)
        checkClassDetails(root)
        sys.exit(0)

    for dirPath, dirNames, fileNames in os.walk(root):

        if dirPath.find('/all/') != -1:
            # These are dups (this is a bit risky, eg, root IS this /all/ directory..)
            continue

        if 'package-summary.html' in fileNames:
            if level in ('class', 'method') and checkSummary('%s/package-summary.html' % dirPath):
                anyMissing = True
            for fileName in fileNames:
                fullPath = '%s/%s' % (dirPath, fileName)
                if not fileName.startswith('package-') and fileName.endswith('.html') and os.path.isfile(fullPath):
                    if level == 'method':
                        if checkClassSummaries(fullPath):
                            anyMissing = True
                    # always look for broken html, regardless of level supplied
                    if checkClassDetails(fullPath):
                        anyMissing = True

        if 'overview-summary.html' in fileNames:
            if level != 'none' and checkSummary('%s/overview-summary.html' % dirPath):
                anyMissing = True

    return anyMissing


if __name__ == '__main__':
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print('usage: %s <dir> [none|package|class|method]' % sys.argv[0])
        sys.exit(1)
    if len(sys.argv) == 2:
        level = 'class'
    else:
        level = sys.argv[2]
    if checkPackageSummaries(sys.argv[1], level):
        print()
        print('Missing javadocs were found!')
        sys.exit(1)
    sys.exit(0)