lucene/solr/solr-ref-guide/tools/CheckLinksAndAnchors.java

397 lines
16 KiB
Java
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
/**
* Check various things regarding anchors, links & general doc structure in the generated HTML site.
*
* <p>
* Problems this tool checks for...
* </p>
*
* <ul>
* <li>
* Asciidoctor doesn't do a good job of rectifying situations where multiple documents that are included in one
* massive (PDF) document may have identical anchors (either explicitly defined, or implicitly defined because of
* section headings). Asciidoctor also doesn't support linking directly to another (included) document by name,
* unless there is an explicit '#fragment' used in the link.
* </li>
* <li>
* Any "relative" link should point to a file that actually exists.
* </li>
* <li>
* Our use of "<a href="https://getbootstrap.com/">Bootstrap</a>" features leverages some custom javascript
* for manipulating the DOM to keep the markup needed in the source <code>*.adoc</code> files simple, but it's
* still possible users may create asciidoctor "blocks" that break conventions (either in Bootstrap or in our
* custom javascript)
* </li>
* </ul>
*
* <p>
* This tool parses the generated HTML site, looking for these situations in order to fail the build, since
* (depending on the type of check) these situations will result in inconsistent/broken HTML, or equivalent
* problems in the generated PDF.
* </p>
* <p>
* This tool supports 2 modes, depending on whether you want to run it against the HTML generated by Jekyll, or
* the "bare bones" HTML generated directly by asciidoctor...
* </p>
* <ul>
* <li>Jekyll Mode:
* <ul>
* <li><code>CheckLinksAndAnchors html-dir-name/ [false]</code></li>
* <li>Requires all html pages have a "main-content" div; ignores all DOM Nodes that are
* <em>not</em> descendants of this div (to exclude redundant template based header, footer, &amp; sidebar links)
* </li>
* <li>Expects that the <code>&lt;body/&gt;</code> tag will have an <code>id</code> matching the page shortname.</li>
* </ul>
* </li>
* <li>Bare Bones Mode:
* <ul>
* <li><code>CheckLinksAndAnchors html-dir-name/ true</code></li>
* <li>Checks all links &amp; anchors in the page.</li>
* <li>"Fakes" the existence of a <code>&lt;body id="..."&gt;</code> tag containing the page shortname.</li>
* </ul>
* </li>
* </ul>
*
*
* TODO: build a list of all known external links so that some other tool could (optionally) ping them all for 200 status?
*
* @see <a href="https://github.com/asciidoctor/asciidoctor/issues/1865">asciidoctor issue 1865</a>
* @see <a href="https://github.com/asciidoctor/asciidoctor/issues/1866">asciidoctor issue 1866</a>
*/
public class CheckLinksAndAnchors { // TODO: rename this class now that it does more then just links & anchors
/**
 * Accepts any path whose name ends, ignoring case, with <code>html</code>.
 * Only the name is inspected; the path is not checked for existence or file type.
 */
public static final class HtmlFileFilter implements FileFilter {
  @Override
  public boolean accept(File pathname) {
    // Locale.ROOT keeps the case folding independent of the JVM's default locale
    return pathname.getName().toLowerCase(Locale.ROOT).endsWith("html");
  }
}
/**
 * Command-line entry point: scans every HTML page found directly in a directory and reports
 * anchor, link and structural problems on stderr.
 *
 * <p>
 * The JVM is terminated with status -1 when the arguments are wrong, when no HTML files are found,
 * or when at least one problem was reported.
 * </p>
 *
 * @param args the html directory, optionally followed by a boolean selecting "bare bones" mode
 * @throws Exception if a page can not be read, or has no element matching the main content selector
 */
public static void main(String[] args) throws Exception {
  int problems = 0;

  if (args.length < 1 || 2 < args.length ) {
    System.err.println("usage: CheckLinksAndAnchors <htmldir> [<bare-bones-boolean>]");
    System.exit(-1);
  }
  final File htmlDir = new File(args[0]);
  final boolean bareBones = (2 == args.length) ? Boolean.parseBoolean(args[1]) : false;

  // listFiles returns null (not an empty array) when htmlDir is missing or is not a directory
  final File[] pages = htmlDir.listFiles(new HtmlFileFilter());
  if (null == pages || 0 == pages.length) {
    System.err.println("No HTML Files found, wrong htmlDir? forgot to built the site?");
    System.exit(-1);
  }

  final Map<String,List<File>> idsToFiles = new HashMap<>();
  final Map<File,List<URI>> filesToRelativeLinks = new HashMap<>();
  final Set<String> idsInMultiFiles = new HashSet<>(0);

  int totalLinks = 0;
  int totalRelativeLinks = 0;

  for (File file : pages) {
    assert ! filesToRelativeLinks.containsKey(file);
    final List<URI> linksInThisFile = new ArrayList<URI>(17);
    filesToRelativeLinks.put(file, linksInThisFile);

    final String fileContents = readFile(file.getPath());
    final Document doc = Jsoup.parse(fileContents);

    // For Jekyll, we only care about class='main-content' -- we don't want to worry
    // about ids/links duplicated in the header/footer of every page.
    final String mainContentSelector = bareBones ? "body" : ".main-content";
    final Element mainContent = doc.select(mainContentSelector).first();
    if (mainContent == null) {
      throw new RuntimeException(file.getName() + " has no main content: " + mainContentSelector);
    }

    // Add all of the IDs in (the main-content of) this doc to idsToFiles (and idsInMultiFiles if needed)
    final Elements nodesWithIds = mainContent.select("[id]");

    if (bareBones) {
      // It's a pain to customize the HTML output structure of asciidoctor's bare-bones html5 backend
      // so instead we "fake" that the body tag contains the attribute we use in jekyll
      // (and what gets added explicitly to each top level section in the PDF)
      nodesWithIds.add(new Element(Tag.valueOf("body"), "").attr("id", file.getName().replaceAll("\\.html$","")));
    } else {
      // We have to add Jekyll's <body> to the nodesWithIds so we check the main section anchor as well,
      // since the ids collected above only come from the main-content element.
      nodesWithIds.addAll(doc.select("body[id]"));
    }

    boolean foundPreamble = false;
    for (Element node : nodesWithIds) {
      final String id = node.id();
      assert null != id;
      assert 0 != id.length();

      // special case id: we ignore the first 'preamble' because
      // it's part of the core markup that asciidoctor always uses
      // if we find it a second time in a single page, fail with a special error...
      if (id.equals("preamble")) {
        if (foundPreamble) {
          problems++;
          System.err.println(file.toURI().toString() +
                             " contains 'preamble' anchor, this is special in jekyll and must not be used in content.");
        } else {
          foundPreamble = true;
          continue;
        }
      }

      if (idsToFiles.containsKey(id)) {
        idsInMultiFiles.add(id);
      } else {
        idsToFiles.put(id, new ArrayList<File>(1));
      }
      idsToFiles.get(id).add(file);
    }

    // check for (relative) links that don't include a fragment
    final Elements links = mainContent.select("a[href]");
    for (Element link : links) {
      totalLinks++;
      final String href = link.attr("href");
      if (0 == href.length()) {
        problems++;
        System.err.println(file.toURI().toString() + " contains link with empty href");
        // already reported; don't also count it below as a relative link w/o an anchor
        continue;
      }
      try {
        final URI uri = new URI(href);
        if (! uri.isAbsolute()) {
          totalRelativeLinks++;
          final String frag = uri.getFragment();
          if (null == frag || "".equals(frag)) {
            // we must have a fragment for intra-page links to work correctly
            problems++;
            System.err.println(file.toURI().toString() + " contains relative link w/o an '#anchor': " + href);
          } else {
            // track the link to validate it exists in the target doc
            linksInThisFile.add(uri);
          }
        }
      } catch (URISyntaxException uri_ex) {
        // before reporting a problem, see if it can be parsed as a valid (absolute) URL
        // some solr examples URLs have characters that aren't legal URI characters
        // Example: "ipod^3.0", "foo:[*+TO+*]", etc...
        try {
          // constructed only for validation: if this isn't absolute, it will fail
          new URL(href);
        } catch (MalformedURLException url_ex) {
          problems++;
          System.err.println(file.toURI().toString() + " contains link w/ invalid syntax: " + href);
          System.err.println(" ... as URI: " + uri_ex.toString());
          System.err.println(" ... as URL: " + url_ex.toString());
        }
      }
    }

    problems += validateHtmlStructure(file, mainContent);
  }

  // if there are problematic ids, report them
  for (String id : idsInMultiFiles) {
    problems++;
    System.err.println("ID occurs multiple times: " + id);
    for (File file : idsToFiles.get(id)) {
      System.err.println(" ... " + file.toURI().toString());
    }
  }

  // check every (relative) link in every file to ensure the frag exists in the target page
  for (Map.Entry<File,List<URI>> entry : filesToRelativeLinks.entrySet()) {
    final File source = entry.getKey();
    for (URI link : entry.getValue()) {
      // a link with no path component points back into the source page itself
      final String path = (null == link.getPath() || "".equals(link.getPath())) ? source.getName() : link.getPath();
      final String frag = link.getFragment();
      if ( ! idsInMultiFiles.contains(frag) ) { // skip problematic dups already reported
        final File dest = new File(htmlDir, path);
        if ( ! dest.exists() ) {
          problems++;
          System.err.println("Relative link points at dest file that doesn't exist: " + link);
          System.err.println(" ... source: " + source.toURI().toString());
        } else if ( ( ! idsToFiles.containsKey(frag) ) || // no file contains this id, or...
                    // id exists, but not in linked file
                    ( ! idsToFiles.get(frag).get(0).getName().equals(path) )) {
          problems++;
          System.err.println("Relative link points at id that doesn't exist in dest: " + link);
          System.err.println(" ... source: " + source.toURI().toString());
        }
      }
    }
  }

  System.err.println("Processed " + totalLinks + " links (" + totalRelativeLinks + " relative) to " +
                     idsToFiles.size() + " anchors in " + pages.length + " files");
  if (0 < problems) {
    System.err.println("Total of " + problems + " problems found");
    System.exit(-1);
  }
}
/**
 * Reads an entire file as UTF-8 text.
 *
 * <p>
 * Lines are read with {@link BufferedReader#readLine()} and re-joined with a single newline
 * character, so the result always ends with a newline when the file has at least one line.
 * </p>
 *
 * @param fileName path of the file to read
 * @return the contents of the file, with a newline appended after every line read
 * @throws IOException if the file can not be opened or read
 */
static String readFile(String fileName) throws IOException {
  // try-with-resources closes the underlying stream even if wrapping it in a reader fails
  try (InputStream in = new FileInputStream(fileName);
       BufferedReader br = new BufferedReader(
           new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
    final StringBuilder sb = new StringBuilder();
    for (String line = br.readLine(); line != null; line = br.readLine()) {
      sb.append(line).append('\n');
    }
    return sb.toString();
  }
}
/**
 * Checks the 'dynamic-tabs' / 'tab-pane' markup conventions inside the main content of one page,
 * printing a message to stderr for every violation.
 *
 * @param f the page being checked (only used to build the messages)
 * @param mainContent root element of the content to inspect
 * @return the number of problems found with this file
 */
private static int validateHtmlStructure(final File f, final Element mainContent) {
  final String file = f.toURI().toString();
  int problemCount = 0;

  for (Element tabs : mainContent.select(".dynamic-tabs")) {
    // each dynamic-tabs needs at least two tab-pane descendants
    final Elements tabPanes = tabs.select(".tab-pane");
    final int paneCount = tabPanes.size();
    if (paneCount < 2) {
      System.err.println(file + " contains a 'dynamic-tabs' with " + paneCount
                         + " 'tab-pane' decendents -- must be at least 2");
      problemCount++;
    }

    // Nothing inside a dynamic-tabs may live outside of a tab-pane.
    //
    // asciidoctor wraps the tab-panes in extra divs, so direct children tell us nothing:
    // instead every ancestor of a pane, and everything selected from within a pane, is allowed.
    final Elements allowed = tabPanes.parents();
    for (Element tabPane : tabPanes) {
      allowed.addAll(tabPane.select("*"));
    }
    final Elements strays = tabs.select("*");
    strays.removeAll(allowed);
    if (0 != strays.size()) {
      final String excerpt = shortStr(strays.text());
      System.err.println(file + " contains a 'dynamic-tabs' with content outside of a 'tab-pane': " + excerpt);
      problemCount++;
    }
  }

  // Look at every tab-pane, including ones that are not inside a dynamic-tabs
  // (which is itself one of the errors reported below)
  final Elements panesInsideTabs = mainContent.select(".dynamic-tabs .tab-pane");
  for (Element pane : mainContent.select(".tab-pane")) {
    final String paneId = pane.id();

    // an id is mandatory
    if (paneId.trim().isEmpty()) {
      System.err.println(file + " contains a 'tab-pane' that does not have a (unique) '#id'");
      problemCount++;
    }

    // label used for this pane in the remaining messages
    final String debug;
    if (paneId.isEmpty()) {
      debug = "'tab-pane'";
    } else {
      debug = "'tab-pane#" + paneId + "'";
    }

    // the 'active' class must not be set in the source
    if (pane.classNames().contains("active")) {
      System.err.println(file + " contains " + debug + " with 'active' defined -- this must be removed");
      problemCount++;
    }

    // must live inside a dynamic-tabs
    if (! panesInsideTabs.contains(pane)) {
      System.err.println(file + " contains " + debug + " that is not a decendent of a 'dynamic-tabs'");
      problemCount++;
    }

    // exactly one tab-label is required, and it has to be a <strong>
    final Elements labels = pane.select(".tab-label");
    final int labelCount = labels.size();
    if (1 != labelCount) {
      System.err.println(file + " contains " + debug + " with " + labelCount
                         + " 'tab-label' decendents -- must be exactly 1");
      problemCount++;
      continue;
    }

    final Element label = labels.first();
    final String labelTag = label.tagName();
    if (! labelTag.equals("strong")) {
      System.err.println(file + " contains " + debug + " with a 'tab-label' using <" + labelTag
                         + "> -- each 'tab-label' must be <strong> (example: '[.tab-label]*Text*')");
      problemCount++;
    }

    // asciidoctor should drop an empty tab-label entirely -- checked anyway
    final String labelText = label.text().trim();
    if (labelText.isEmpty()) {
      System.err.println(file + " contains " + debug + " with a blank 'tab-label'");
      problemCount++;
    }

    // the label text has to be the first text of the pane
    final String paneText = pane.text().trim();
    if (! paneText.startsWith(labelText)) {
      System.err.println(file + " contains " + debug + " with text before the 'tab-label' ('" + labelText + "')");
      problemCount++;
    }
  }
  return problemCount;
}
/**
 * Abbreviates a string for use in an error message.
 *
 * @param s the text to abbreviate; must not be null
 * @return <code>s</code> itself when it is shorter than 20 characters, otherwise its first 17
 *         characters followed by <code>...</code>
 */
public static final String shortStr(String s) {
  final boolean shortEnough = s.length() < 20;
  return shortEnough ? s : s.substring(0, 17) + "...";
}
}