druid/docs/_bin/broken-link-check.py

#!/usr/bin/env python3

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import re
import sys

#
# Checks for broken redirects (in _redirects.json) and links from markdown files to
# nonexistent pages. Does _not_ check for links to anchors that don't exist.
#

# Link and redirect targets listed here are accepted as-is, without checking
# that a corresponding file exists on disk.
WELL_KNOWN_PAGES = [
  "/libraries.html",
  "/downloads.html",
  "/community/",
  "/thanks.html",
]

def normalize_link(source, target):
  """Resolve `target` relative to the directory that contains `source`.

  The joined path is run through os.path.normpath, so '.' and '..' segments
  are collapsed. This is pure string manipulation; the filesystem is not
  consulted.
  """
  return os.path.normpath(os.path.join(os.path.dirname(source), target))

def verify_redirects(docs_directory, redirect_json):
  """Check that each redirect target resolves to an existing markdown file.

  Args:
    docs_directory: Directory that resolved targets are looked up in.
    redirect_json: Path to a JSON file containing a list of objects, each
      with "source" and "target" string entries.

  Returns:
    True if every checked target exists; False if at least one is missing.
    Each missing target is reported on stderr.
  """
  # JSON text is UTF-8 by specification, so decode it explicitly rather than
  # relying on the platform's locale-dependent default encoding.
  with open(redirect_json, 'r', encoding='utf-8') as f:
    redirects = json.load(f)

  ok = True
  for redirect in redirects:
    if redirect["target"] in WELL_KNOWN_PAGES:
      continue

    # Resolve the target relative to the redirect source, then swap a trailing
    # ".html" (plus any "#anchor") for ".md" to get the file expected on disk.
    resolved = normalize_link(redirect["source"], redirect["target"])
    target = re.sub(r'\.html(#.*)?$', '.md', resolved)
    if not os.path.exists(os.path.join(docs_directory, target)):
      sys.stderr.write('Redirect [' + redirect["source"] + '] target does not exist: ' + redirect["target"] + "\n")
      ok = False

  return ok

def verify_markdown(docs_directory):
  """Check that links in markdown files point at files that exist.

  Walks `docs_directory` recursively, scans every '*.md' file for inline
  markdown links, and verifies that each non-http(s) target exists on disk
  once '.html' has been mapped to '.md' and a trailing '/' to '/index.md'.
  Anchors are stripped and are not themselves validated.

  Args:
    docs_directory: Root directory to search for markdown files.

  Returns:
    True if every checked link target exists; False if at least one is
    missing. Each missing target is reported on stderr.
  """
  # Matches inline links of the form [text](target) or [text](target "title").
  # Compiled once because it is applied to every markdown file.
  link_pattern = re.compile(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)')

  # Get list of markdown files.
  markdowns = []
  for root, _dirs, files in os.walk(docs_directory):
    for name in files:
      if name.endswith('.md'):
        markdowns.append(os.path.join(root, name))

  ok = True
  for markdown_file in markdowns:
    # Decode explicitly as UTF-8: the default encoding of open() depends on the
    # locale, and non-ASCII text would otherwise fail under e.g. a C locale.
    with open(markdown_file, 'r', encoding='utf-8') as f:
      content = f.read()

    for m in link_pattern.finditer(content):
      target = m.group(2)

      if target in WELL_KNOWN_PAGES:
        continue

      if markdown_file.endswith("/druid-kerberos.md") and target in ['regexp', 'druid@EXAMPLE.COM']:
        # Hack to support the fact that rule examples in druid-kerberos docs look sort of like markdown links.
        continue

      # Map the link as written to the markdown file expected on disk.
      # NOTE(review): after the '/docs/VERSION/' prefix is removed, the rest is
      # resolved relative to the linking file's directory, not the docs root --
      # confirm that this is intended.
      target = re.sub(r'^/docs/VERSION/', '', target)
      target = re.sub(r'#.*$', '', target)
      target = re.sub(r'\.html$', '.md', target)
      target = re.sub(r'/$', '/index.md', target)

      # Nothing to check for pure-anchor links (now empty) or external URLs.
      if not target or target.startswith(('http://', 'https://')):
        continue

      if not os.path.exists(normalize_link(markdown_file, target)):
        sys.stderr.write('Page     [' + markdown_file + '] target does not exist: ' + m.group(2) + "\n")
        ok = False

  return ok

def main():
  """Command-line entry point.

  Expects exactly two arguments after the program name: the docs directory
  and the redirects JSON file. Exits with status 1 on a usage error or when
  either check reports a problem; both checks are always run so that all
  problems are reported in one pass.
  """
  args = sys.argv[1:]
  if len(args) != 2:
    sys.stderr.write('usage: program <docs dir> <redirect.json>\n')
    sys.exit(1)

  docs_directory, redirect_json = args
  redirects_ok = verify_redirects(docs_directory, redirect_json)
  markdown_ok = verify_markdown(docs_directory)
  if not (redirects_ok and markdown_ok):
    sys.exit(1)

# Run the checks only when executed as a script, so that importing this module
# has no side effects.
if __name__ == "__main__":
  main()