druid/docs/_bin/broken-link-check.py

102 lines
3.4 KiB
Python
Executable File

#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
import sys
#
# Checks for broken redirects (in _redirects.json) and links from markdown files to
# nonexistent pages. Does _not_ check for links to anchors that don't exist.
#
# Links and redirects whose target is exactly one of these 'well known' pages
# are accepted as-is, without looking for a matching file on the filesystem.
WELL_KNOWN_PAGES = [
    "/libraries.html",
    "/downloads.html",
    "/community/",
    "/thanks.html",
]
def normalize_link(source, target):
    """Resolve ``target`` against the directory that contains ``source``.

    Args:
        source: Path of the file the link appears in.
        target: Link target, interpreted relative to the directory of
            ``source``.

    Returns:
        The joined path after ``os.path.normpath`` has collapsed ``.`` and
        ``..`` segments and redundant separators.
    """
    base_dir, _ = os.path.split(source)
    return os.path.normpath(os.path.join(base_dir, target))
def verify_redirects(docs_directory, redirect_json):
    """Check that every redirect listed in ``redirect_json`` has an existing target.

    Args:
        docs_directory: Directory against which resolved redirect targets are
            looked up.
        redirect_json: Path to a JSON file containing a list of objects, each
            with a "source" and a "target" entry.

    Returns:
        True if every redirect target was found (or is one of
        WELL_KNOWN_PAGES), False otherwise. Each missing target is reported
        on stderr.
    """
    ok = True

    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding when decoding the file.
    with open(redirect_json, 'r', encoding='utf-8') as f:
        redirects = json.load(f)

    for redirect in redirects:
        target = redirect["target"]
        if target in WELL_KNOWN_PAGES:
            continue

        source = redirect["source"]

        # Replace .html and named anchors with .md, and check the file on the filesystem.
        target_md = re.sub(r'\.html(#.*)?$', '.md', normalize_link(source, target))
        if not os.path.exists(os.path.join(docs_directory, target_md)):
            sys.stderr.write('Redirect [' + source + '] target does not exist: ' + target + "\n")
            ok = False

    return ok
def verify_markdown(docs_directory):
    """Check that links in the markdown files under ``docs_directory`` resolve.

    Every ``*.md`` file found by walking ``docs_directory`` is scanned for
    inline markdown links of the form ``[text](target)`` (optionally with a
    quoted title). For each target:

    * targets listed in WELL_KNOWN_PAGES are accepted as-is;
    * a leading ``/docs/VERSION/`` prefix and any ``#anchor`` are removed;
    * a trailing ``.html`` is mapped to ``.md`` and a trailing ``/`` to
      ``/index.md``;
    * empty targets and ``http://`` / ``https://`` targets are not checked;
    * anything else is resolved relative to the linking file and must exist.

    Args:
        docs_directory: Directory to walk for markdown files.

    Returns:
        True if every checked link target exists, False otherwise. Each
        missing target is reported on stderr.
    """
    ok = True

    # Compiled once instead of being looked up again for every file.
    link_pattern = re.compile(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)')

    # Get list of markdown files.
    markdowns = []
    for root, _dirs, files in os.walk(docs_directory):
        for name in files:
            if name.endswith('.md'):
                markdowns.append(os.path.join(root, name))

    for markdown_file in markdowns:
        # Read as UTF-8 explicitly so that non-ASCII content does not fail to
        # decode under a non-UTF-8 locale.
        with open(markdown_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Compare the file name rather than a '/'-terminated suffix so the
        # special case also applies where os.sep is not '/'.
        is_kerberos_doc = os.path.basename(markdown_file) == 'druid-kerberos.md'

        for m in link_pattern.finditer(content):
            target = m.group(2)

            if target in WELL_KNOWN_PAGES:
                continue

            if is_kerberos_doc and target in ['regexp', 'druid@EXAMPLE.COM']:
                # Hack to support the fact that rule examples in druid-kerberos docs look sort of like markdown links.
                continue

            target = re.sub(r'^/docs/VERSION/', '', target)
            target = re.sub(r'#.*$', '', target)
            target = re.sub(r'\.html$', '.md', target)
            target = re.sub(r'/$', '/index.md', target)

            # An empty target was a pure anchor link; external URLs are not checked.
            if target and not target.startswith(('http://', 'https://')):
                target_normalized = normalize_link(markdown_file, target)

                if not os.path.exists(target_normalized):
                    sys.stderr.write('Page [' + markdown_file + '] target does not exist: ' + m.group(2) + "\n")
                    ok = False

    return ok
def main():
    """Command-line entry point: ``program <docs dir> <redirect.json>``.

    Exits with status 1 when the argument count is wrong (after printing a
    usage line to stderr) or when either check reports a problem. Both the
    redirect check and the markdown check are always run, so every problem
    is reported in a single pass.
    """
    args = sys.argv[1:]
    if len(args) != 2:
        sys.stderr.write('usage: program <docs dir> <redirect.json>\n')
        sys.exit(1)

    docs_directory, redirect_json = args
    redirects_ok = verify_redirects(docs_directory, redirect_json)
    markdown_ok = verify_markdown(docs_directory)

    if not (redirects_ok and markdown_ok):
        sys.exit(1)
# Run the checker only when this file is executed as a script, so that
# importing the module does not parse sys.argv or call sys.exit().
if __name__ == "__main__":
    main()