2020-01-13 19:09:56 +01:00

427 lines
17 KiB

* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
import java.nio.file.Files;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
* Check various things regarding anchors, links & general doc structure in the generated HTML site.
* <p>
* Usage: <code>java CheckLinksAndAnchors some-html-dir-name/ [-check-all-relative-links] [-bare-bones]</code>
* </p>
* <p>
* Problems this tool checks for...
* </p>
* <ul>
* <li>
* Validates that no file contains the same anchor more than once.
* </li>
* <li>
* Validates that relative links point to a file that actually exists and, if that file is part of the ref-guide, that the '#fragment' in the link refers to an ID that exists in that file.
* </li>
* <li>
* Our use of "<a href="">Bootstrap</a>" features leverages some custom javascript
* for manipulating the DOM to keep the markup needed in the source <code>*.adoc</code> files simple, but it's
* still possible users may create asciidoctor "blocks" that break conventions (either in Bootstrap or in our
* custom javascript)
* </li>
* </ul>
* <p>
* This tool parses the generated HTML site, looking for these situations in order to fail the build, since
* (depending on the type of check) these situations will result in inconsistent/broken HTML.
* </p>
* <p>
* This tool supports 2 command line options:
* </p>
* <ul>
* <li><b>-check-all-relative-links</b><br />
* <p>By default, only relative links to files in the same directory (ie: not starting with
* <code>"../"</code>) are checked for existence. This means that we can do a "quick" validation of
* links to other ref-guide files, but ignore relative links to things outside of the ref-guide --
* such as javadocs that we may not currently have built. If this option is specified then we
* <em>also</em> check relative links where the path starts with <code>"../"</code>.
* </p>
* </li>
* <li><b>-bare-bones</b><br/>
* <p>By default, this tool assumes it is analyzing Jekyll generated files. If this option is specified,
* then it instead assumes it's checking "bare bones" HTML files...
* </p>
* <ul>
* <li>Jekyll Mode:
* <ul>
* <li>Requires all html pages have a "content" div; ignores all DOM Nodes that are
* <em>not</em> descendants of this div (to exclude redundant template based header, footer,
* &amp; sidebar links)
* </li>
* <li>Expects that the <code>&lt;body/&gt;</code> tag will have an <code>id</code> matching
* the page shortname.</li>
* </ul>
* </li>
* <li>Bare Bones Mode:
* <ul>
* <li>Checks all links &amp; anchors in the page.</li>
* <li>"Fakes" the existence of a <code>&lt;body id="..."&gt;</code> tag containing the
* page shortname.</li>
* </ul>
* </li>
* </ul>
* </li>
* </ul>
* TODO: build a list of all known external links so that some other tool could (optionally) ping them all for 200 status?
* @see ""
* @see ""
public class CheckLinksAndAnchors { // TODO: rename this class now that it does more then just links & anchors
/**
 * {@link FileFilter} that accepts a path when its file name, lower-cased using
 * {@link Locale#ROOT}, ends with the four characters {@code html}.
 *
 * <p>The suffix test has no leading dot, so it matches {@code page.html} and {@code page.xhtml}
 * alike, and it does not distinguish files from directories.
 */
public static final class HtmlFileFilter implements FileFilter {
// Locale.ROOT keeps the lower-casing independent of the JVM's default locale
public boolean accept(File pathname) {
return pathname.getName().toLowerCase(Locale.ROOT).endsWith("html");
/**
 * Command-line entry point.
 *
 * <p>Takes the HTML directory as {@code args[0]} and lists the files directly inside it that
 * pass {@link HtmlFileFilter}. For each page it parses the HTML with Jsoup, examines element ids
 * and {@code a[href]} links, and calls {@code validateHtmlStructure}. A second pass then walks
 * the recorded relative links and checks that the destination file exists and, for links that do
 * not start with {@code "../"}, that the fragment is a known id of the destination page.
 * Every finding is printed to {@code System.err}.
 *
 * <p>Options read from the remaining arguments: {@code -bare-bones} and
 * {@code -check-all-relative-links}. Anything else left in the option set is reported as
 * unrecognized.
 *
 * <p>NOTE(review): this copy of the method appears truncated. Several initializers are empty,
 * closing braces are absent, and {@code totalLinks} / {@code totalRelativeLinks} are never
 * visibly incremented. Compare with the upstream source before relying on exact control flow.
 *
 * @param args the HTML directory followed by optional flags
 * @throws Exception as declared; the visible code also throws {@link RuntimeException} when a
 *         page has no main content element
 */
public static void main(String[] args) throws Exception {
int problems = 0;
// the html directory argument is mandatory
if (args.length < 1) {
System.err.println("usage: CheckLinksAndAnchors <htmldir> [-check-all-relative-links] [-bare-bones]");
final File htmlDir = new File(args[0]);
// every argument after the directory is treated as an option flag
final Set<String> options = new LinkedHashSet<>();
for (int i = 1; i < args.length; i++) {
if (! args[i].trim().isEmpty()) { // ignore blank options - maybe an ant sysprop blanked on purpose
// Set.remove returns true only when the flag was present, so each call both detects
// the flag and takes it out of the set of not-yet-recognized options
final boolean bareBones = options.remove("-bare-bones");
final boolean checkAllRelativeLinks = options.remove("-check-all-relative-links");
// whatever is still in the set is an option this tool does not know about
if (! options.isEmpty()) {
for (String brokenOpt : options) {
System.err.println("CheckLinksAndAnchors: Unrecognized option: " + brokenOpt);
// NOTE(review): File.listFiles returns null when htmlDir does not denote a directory or an
// I/O error occurs; in that case the length check below throws NullPointerException
final File[] pages = htmlDir.listFiles(new HtmlFileFilter());
if (0 == pages.length) {
System.err.println("CheckLinksAndAnchors: No HTML Files found, wrong htmlDir? forgot to built the site?");
// links recorded per source file (keyed by the File object)
final Map<File,List<URI>> filesToRelativeLinks = new HashMap<>();
// ids recorded per page, keyed by the bare file name (see the put() below), not the full path
final Map<String,Set<String>> filesToIds = new HashMap<>();
int totalLinks = 0;
int totalRelativeLinks = 0;
int totalIds = 0;
// ---- pass 1: parse each page and inspect its ids and links ----
for (File file : pages) {
//System.out.println("input File URI: " + file.toURI().toString());
assert ! filesToRelativeLinks.containsKey(file);
final List<URI> linksInThisFile = new ArrayList<URI>(17);
filesToRelativeLinks.put(file, linksInThisFile);
final Set<String> idsInThisFile = new LinkedHashSet<String>(17);
filesToIds.put(file.getName(), idsInThisFile);
// use this for error reporting if an ID exists multiple times in a single document
final Map<String,List<Element>> idsToNodes = new HashMap<>();
final String fileContents = readFile(file.getPath());
final Document doc = Jsoup.parse(fileContents);
// For Jekyll, we only care about class='content' -- we don't want to worry
// about ids/links duplicated in the header/footer of every page,
final String mainContentSelector = bareBones ? "body" : ".content";
// NOTE(review): the initializer is missing in this copy; presumably a lookup of
// mainContentSelector in doc -- confirm against upstream
final Element mainContent =;
if (mainContent == null) {
throw new RuntimeException(file.getName() + " has no main content: " + mainContentSelector);
// All of the ID (nodes) in (the content of) this doc
// NOTE(review): the receiver/method of this "[id]" selector call is missing in this copy
final Elements nodesWithIds ="[id]");
if (bareBones) {
// It's a pain in the ass to customize the HTML output structure asciidoctor's bare-bones html5 backend
// so instead we "fake" that the body tag contains the attribute we use in jekyll
// (the synthetic id is the file name with a trailing ".html" removed)
nodesWithIds.add(new Element(Tag.valueOf("body"), "").attr("id", file.getName().replaceAll("\\.html$","")));
} else {
// We have to add Jekyll's <body> to the nodesWithIds so we check the main section anchor as well
// since we've already
boolean foundPreamble = false;
for (Element node : nodesWithIds) {
// NOTE(review): the initializer is missing in this copy; presumably the node's id attribute
final String id =;
assert null != id;
assert 0 != id.length();
// special case id: we ignore the first 'preamble' because
// it's part of the core markup that asciidoctor always uses
// if we find it a second time in a single page, fail with a special error...
if (id.equals("preamble")) {
if (foundPreamble) {
System.err.println(file.toURI().toString() +
" contains 'preamble' anchor, this is special in jekyll and must not be used in content.");
} else {
foundPreamble = true;
continue; // Note: we specifically don't count this in totalIds
// an id already recorded for this page is a duplicate
if (idsInThisFile.contains(id)) {
System.err.println(file.toURI().toString() + " contains ID multiple times: " + id);
totalIds++; // Note: we specifically don't count 'preamble'
// check for (relative) links that don't include a fragment
// NOTE(review): the receiver/method of this "a[href]" selector call is missing in this copy
final Elements links ="a[href]");
for (Element link : links) {
final String href = link.attr("href");
if (0 == href.length()) {
System.err.println(file.toURI().toString() + " contains link with empty href");
try {
final URI uri = new URI(href);
// only relative (scheme-less) links are examined further
if (! uri.isAbsolute()) {
final String frag = uri.getFragment();
if ((null == frag || "".equals(frag)) && ! uri.getPath().startsWith("../")) {
// we must have a fragment for intra-page links to work correctly
// but relative links "up and out" of ref-guide (Ex: local javadocs)
// don't require them (even if checkAllRelativeLinks is set)
System.err.println(file.toURI().toString() + " contains relative link w/o an '#anchor': " + href);
} else {
// track the link to validate it exists in the target doc
} catch (URISyntaxException uri_ex) {
// before reporting a problem, see if it can be parsed as a valid (absolute) URL
// some solr examples URLs have characters that aren't legal URI characters
// Example: "ipod^3.0", "foo:[*+TO+*]", etc...
boolean href_is_valid_absolute_url = false;
try {
// if this isn't absolute, it will fail
final URL ignored = new URL(href);
href_is_valid_absolute_url = true;
} catch (MalformedURLException url_ex) {
// neither parser accepted the href: report both failures
System.err.println(file.toURI().toString() + " contains link w/ invalid syntax: " + href);
System.err.println(" ... as URI: " + uri_ex.toString());
System.err.println(" ... as URL: " + url_ex.toString());
// structural (dynamic-tabs / tab-pane) checks return their own problem count
problems += validateHtmlStructure(file, mainContent);
// ---- pass 2 ----
// check every (relative) link in every file to ensure the frag exists in the target page
for (Map.Entry<File,List<URI>> entry : filesToRelativeLinks.entrySet()) {
final File source = entry.getKey();
for (URI link : entry.getValue()) {
// a link with no path (fragment only) targets the source page itself
final String path = (null == link.getPath() || "".equals(link.getPath())) ? source.getName() : link.getPath();
final File dest = new File(htmlDir, path);
if ( ! dest.exists() ) {
// this is only a problem if it's in our dir, or checkAllRelativeLinks is set...
if (checkAllRelativeLinks || ! path.startsWith("../")) {
System.err.println("Relative link points at dest file that doesn't exist: " + link);
System.err.println(" ... source: " + source.toURI().toString());
} else {
if ( ! path.startsWith("../") ) {
// if the dest file is part of the ref guide (ie: not an "up and out" link to javadocs)
// then we validate the fragment is known and exists in that file...
final String frag = link.getFragment();
// NOTE(review): filesToIds only has entries for the pages parsed above; if dest exists but
// was not one of them, this lookup yields null and (with assertions disabled) the
// contains() call below would throw NullPointerException -- confirm whether that can happen
final Set<String> knownIdsInDest = filesToIds.get(dest.getName());
assert null != knownIdsInDest : dest.getName();
if (! knownIdsInDest.contains(frag) ) {
System.err.println("Relative link points at id that doesn't exist in dest: " + link);
System.err.println(" ... source: " + source.toURI().toString());
// summary line, followed by the problem total when any were counted
System.err.println("Processed " + totalLinks + " links (" + totalRelativeLinks + " relative) to " +
totalIds + " anchors in " + pages.length + " files");
if (0 < problems) {
System.err.println("Total of " + problems + " problems found");
/**
 * Reads a file as UTF-8 text and returns the contents of the {@link StringBuilder} built while
 * reading.
 *
 * <p>The file is opened with a {@link FileInputStream}, decoded through an
 * {@link InputStreamReader} using the charset name {@code "UTF-8"}, and consumed line by line
 * via {@link BufferedReader#readLine()} until it returns {@code null}.
 *
 * <p>NOTE(review): in this copy the loop body that appends to {@code sb} and the body of the
 * {@code finally} clause are not visible, so whether line terminators are preserved and whether
 * the reader is closed cannot be confirmed here -- check against upstream.
 *
 * @param fileName path of the file to read
 * @return the accumulated text
 * @throws IOException if the file cannot be opened or read
 */
static String readFile(String fileName) throws IOException {
InputStream in = new FileInputStream(fileName);
// explicit charset so decoding does not depend on the platform default
Reader reader = new InputStreamReader(in,"UTF-8");
BufferedReader br = new BufferedReader(reader);
try {
StringBuilder sb = new StringBuilder();
// readLine() strips the line terminator and returns null at end of stream
String line = br.readLine();
while (line != null) {
line = br.readLine();
return sb.toString();
} finally {
* returns the number of problems found with this file
/**
 * Checks the 'dynamic-tabs' / 'tab-pane' / 'tab-label' markup conventions within a page's main
 * content and prints one line to {@code System.err} per violation. Conditions reported:
 * <ul>
 *   <li>a 'dynamic-tabs' with fewer than 2 'tab-pane' descendants</li>
 *   <li>a 'dynamic-tabs' with content outside of a 'tab-pane'</li>
 *   <li>a 'tab-pane' without an id</li>
 *   <li>a 'tab-pane' carrying the 'active' class</li>
 *   <li>a 'tab-pane' that is not inside a 'dynamic-tabs'</li>
 *   <li>a 'tab-pane' whose number of 'tab-label' descendants is not exactly 1</li>
 *   <li>a 'tab-label' that is not a {@code <strong>} element, or whose text is blank</li>
 *   <li>a 'tab-pane' whose trimmed text does not start with its label text</li>
 * </ul>
 *
 * <p>NOTE(review): several selector calls and conditions are truncated in this copy, and no
 * increment of {@code problems} is visible, so the returned count cannot be verified from here.
 *
 * @param f the page being checked; only used to build the URI string that prefixes each message
 * @param mainContent root element of the content to inspect
 * @return the value of the local {@code problems} counter
 */
private static int validateHtmlStructure(final File f, final Element mainContent) {
final String file = f.toURI().toString();
int problems = 0;
// NOTE(review): the receiver/method of the ".dynamic-tabs" selector call is missing in this copy
for (Element tab :".dynamic-tabs")) {
// must be at least two tab-pane descendants of each dynamic-tabs
final Elements panes =".tab-pane");
final int numPanes = panes.size();
if (numPanes < 2) {
System.err.println(file + " contains a 'dynamic-tabs' with "+ numPanes+" 'tab-pane' decendents -- must be at least 2");
// must not have any descendants of a dynamic-tabs that are not part of tab-pane
// this is kind of tricky, because asciidoctor creates wrapper divs around the tab-panes
// so we can't make assumptions about direct children
final Elements elementsToIgnore = panes.parents();
for (Element pane : panes) {
final Elements nonPaneDecendents ="*");
if (0 != nonPaneDecendents.size()) {
System.err.println(file + " contains a 'dynamic-tabs' with content outside of a 'tab-pane': " +
// Now fetch all tab-panes, even if they aren't in a dynamic-tabs instance
// (that's a type of error we want to check for)
final Elements validPanes =".dynamic-tabs .tab-pane");
final Elements allPanes =".tab-pane");
for (Element pane : allPanes) {
// every tab-pane must have an id
// NOTE(review): the condition expression is missing in this copy
if ( {
System.err.println(file + " contains a 'tab-pane' that does not have a (unique) '#id'");
// label used in the messages below to identify the pane
final String debug = "'tab-pane" + ( ? "" : "#" + + "'";
// no 'active' class on any tab-pane
if (pane.classNames().contains("active")) {
System.err.println(file + " contains " + debug + " with 'active' defined -- this must be removed");
// every tab-pane must be a descendant of a dynamic-tabs
if (! validPanes.contains(pane)) {
System.err.println(file + " contains " + debug + " that is not a decendent of a 'dynamic-tabs'");
// every tab-pane must have exactly 1 tab-label which is <strong>
Elements labels =".tab-label");
if (1 != labels.size()) {
System.err.println(file + " contains " + debug + " with " + labels.size() + " 'tab-label' decendents -- must be exactly 1");
} else {
Element label = labels.first();
if (! label.tagName().equals("strong")) {
System.err.println(file + " contains " + debug + " with a 'tab-label' using <"
+ labels.first().tagName() + "> -- each 'tab-label' must be <strong> (example: '[.tab-label]*Text*')");
final String labelText = label.text().trim();
// if the tab-label is the empty string, asciidoctor should optimize it away -- but let's check for it anyway
if (labelText.isEmpty()) {
System.err.println(file + " contains " + debug + " with a blank 'tab-label'");
// validate label must be first paragraph? first text content?
// (what is actually compared: the pane's trimmed text must begin with the label's trimmed text)
if (! pane.text().trim().startsWith(labelText)) {
System.err.println(file + " contains " + debug + " with text before the 'tab-label' ('" + labelText + "')");
return problems;
/**
 * Abbreviates a string to at most 20 characters.
 *
 * <p>A string shorter than 20 characters is returned unchanged. Anything longer is cut to its
 * first 17 characters with {@code "..."} appended, giving exactly 20 characters.
 *
 * @param s the string to abbreviate; must not be {@code null}, since {@code length()} is called
 *        on it unconditionally
 * @return {@code s} itself, or its 17-character prefix followed by {@code "..."}
 */
public static final String shortStr(String s) {
// a string of exactly 20 characters also takes the truncation path
if (s.length() < 20) {
return s;
return s.substring(0, 17) + "...";