#!/usr/bin/env python3

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Check documentation for broken redirects and broken page links.

Checks for broken redirects (in _redirects.json) and links from markdown
files to nonexistent pages. Does _not_ check for links to anchors that
don't exist.
"""

import json
import os
import re
import sys

# Targets to these 'well known' pages are OK.
WELL_KNOWN_PAGES = ["/libraries.html", "/downloads.html", "/community/", "/thanks.html"]

# Rewrites a redirect target's ".html" suffix (plus optional "#anchor") to ".md".
_REDIRECT_HTML_RE = re.compile(r'\.html(#.*)?$')

# Matches "[text](target)" and "[text](target "title")"; group 2 is the target.
_MARKDOWN_LINK_RE = re.compile(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)')

# Rewrites applied, in this order, to a markdown link target before it is
# looked up on the filesystem.
_DOCS_VERSION_PREFIX_RE = re.compile(r'^/docs/VERSION/')
_ANCHOR_RE = re.compile(r'#.*$')
_HTML_SUFFIX_RE = re.compile(r'\.html$')
_TRAILING_SLASH_RE = re.compile(r'/$')

# Strings in druid-kerberos.md that match the link pattern but are not links.
_KERBEROS_FALSE_LINKS = ['regexp', 'druid@EXAMPLE.COM']


def normalize_link(source, target):
    """Resolve ``target`` relative to the directory containing ``source``.

    Returns the joined path after ``os.path.normpath`` has collapsed any
    ``.`` / ``..`` segments.
    """
    dirname = os.path.dirname(source)
    return os.path.normpath(os.path.join(dirname, target))


def verify_redirects(docs_directory, redirect_json):
    """Check that every redirect target corresponds to an existing markdown file.

    ``redirect_json`` is the path of a JSON file holding a list of objects
    with "source" and "target" keys. Targets listed in WELL_KNOWN_PAGES are
    skipped. Each broken redirect is reported on stderr.

    Returns True when no broken redirect was found, False otherwise.
    """
    ok = True

    # Explicit encoding so the result does not depend on the process locale.
    with open(redirect_json, 'r', encoding='utf-8') as f:
        redirects = json.load(f)

    for redirect in redirects:
        source = redirect["source"]
        raw_target = redirect["target"]
        if raw_target in WELL_KNOWN_PAGES:
            continue

        # Replace .html and named anchors with .md, and check the file on the filesystem.
        target = _REDIRECT_HTML_RE.sub('.md', normalize_link(source, raw_target))
        if not os.path.exists(os.path.join(docs_directory, target)):
            sys.stderr.write(
                'Redirect [' + source + '] target does not exist: ' + raw_target + "\n")
            ok = False

    return ok


def _find_markdown_files(docs_directory):
    """Return the paths of all ``*.md`` files found under ``docs_directory``."""
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(docs_directory)
        for name in files
        if name.endswith('.md')
    ]


def _markdown_target_to_path(target):
    """Rewrite a markdown link target into the source path it should resolve to.

    Strips a leading "/docs/VERSION/" and any "#anchor", maps a ".html"
    suffix to ".md", and maps a trailing "/" to "/index.md".
    """
    target = _DOCS_VERSION_PREFIX_RE.sub('', target)
    target = _ANCHOR_RE.sub('', target)
    target = _HTML_SUFFIX_RE.sub('.md', target)
    return _TRAILING_SLASH_RE.sub('/index.md', target)


def verify_markdown(docs_directory):
    """Check that links in markdown files under ``docs_directory`` resolve.

    Links to WELL_KNOWN_PAGES, http:// and https:// URLs, and targets that
    are empty once the anchor is removed are not checked. Every other target
    is resolved relative to the linking file and must exist on the
    filesystem. Each broken link is reported on stderr.

    Returns True when no broken link was found, False otherwise.
    """
    ok = True

    for markdown_file in _find_markdown_files(docs_directory):
        # Explicit encoding so non-ASCII docs decode the same under any locale.
        with open(markdown_file, 'r', encoding='utf-8') as f:
            content = f.read()

        is_kerberos_doc = markdown_file.endswith("/druid-kerberos.md")

        for m in _MARKDOWN_LINK_RE.finditer(content):
            raw_target = m.group(2)
            if raw_target in WELL_KNOWN_PAGES:
                continue

            if is_kerberos_doc and raw_target in _KERBEROS_FALSE_LINKS:
                # Hack to support the fact that rule examples in druid-kerberos docs
                # look sort of like markdown links.
                continue

            target = _markdown_target_to_path(raw_target)
            if not target or target.startswith(('http://', 'https://')):
                continue

            if not os.path.exists(normalize_link(markdown_file, target)):
                sys.stderr.write(
                    'Page [' + markdown_file + '] target does not exist: ' + raw_target + "\n")
                ok = False

    return ok


def main():
    """Run both checks using command-line arguments; exit with status 1 on failure.

    Expects exactly two arguments: the docs directory and the path of the
    redirects JSON file.
    """
    if len(sys.argv) != 3:
        sys.stderr.write('usage: program <docs directory> <redirects json>\n')
        sys.exit(1)

    docs_directory, redirect_json = sys.argv[1], sys.argv[2]

    # Run both checks unconditionally so every problem is reported in one pass.
    ok = verify_redirects(docs_directory, redirect_json)
    ok = verify_markdown(docs_directory) and ok

    if not ok:
        sys.exit(1)


# Guarded so importing this module does not run the checker or exit.
if __name__ == '__main__':
    main()