/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.*; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.net.MalformedURLException; import java.nio.file.Files; import java.util.Arrays; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.select.Elements; import org.jsoup.select.NodeVisitor; /** * Check various things regarding anchors, links & general doc structure in the generated HTML site. * *

* Problems this tool checks for... *

* * * *

 * This tool parses the generated HTML site, looking for these situations in order to fail the build, since
 * (depending on the type of check) these situations will result in inconsistent/broken HTML, or equivalent
 * problems in the generated PDF.
 *

*

 * This tool supports 2 modes, depending on whether you want to run it against the HTML generated by Jekyll, or
 * the "bare bones" HTML generated directly by asciidoctor...
 *

*
* TODO: build a list of all known external links so that some other tool could (optionally) ping them all for 200 status?
*
* @see <a href="https://github.com/asciidoctor/asciidoctor/issues/1865">asciidoctor issue 1865</a>
* @see <a href="https://github.com/asciidoctor/asciidoctor/issues/1866">asciidoctor issue 1866</a>
*/
public class CheckLinksAndAnchors { // TODO: rename this class now that it does more than just links & anchors

  /**
   * Accepts any file whose name ends (case-insensitively) with "html".
   */
  public static final class HtmlFileFilter implements FileFilter {
    @Override
    public boolean accept(File pathname) {
      // Locale.ROOT keeps the case folding independent of the JVM's default locale
      return pathname.getName().toLowerCase(Locale.ROOT).endsWith("html");
    }
  }

  /**
   * Command line entry point.
   *
   * @param args <code>args[0]</code> is the directory containing the generated HTML files;
   *             the optional <code>args[1]</code> is parsed as a boolean selecting "bare bones" mode
   *             (defaults to false)
   * @throws Exception if a file can not be read, or a page has no main content element
   */
  public static void main(String[] args) throws Exception {
    int problems = 0;

    if (args.length < 1 || 2 < args.length) {
      System.err.println("usage: CheckLinksAndAnchors <htmldir> [<bare-bones-boolean>]");
      System.exit(-1);
    }
    final File htmlDir = new File(args[0]);
    final boolean bareBones = (2 == args.length) && Boolean.parseBoolean(args[1]);

    // listFiles() returns null (not an empty array) if htmlDir is not a directory or can't be read
    final File[] pages = htmlDir.listFiles(new HtmlFileFilter());
    if (null == pages || 0 == pages.length) {
      System.err.println("No HTML Files found, wrong htmlDir? forgot to built the site?");
      System.exit(-1);
    }

    // every id found, mapped to the file(s) it was found in
    final Map<String, List<File>> idsToFiles = new HashMap<>();
    // every file, mapped to the relative links (that include a fragment) found in it
    final Map<File, List<URI>> filesToRelativeLinks = new HashMap<>();
    // ids that were seen more than once (in one file or several)
    final Set<String> idsInMultiFiles = new HashSet<>(0);

    int totalLinks = 0;
    int totalRelativeLinks = 0;

    for (File file : pages) {
      //System.out.println("input File URI: " + file.toURI().toString());

      assert ! filesToRelativeLinks.containsKey(file);
      final List<URI> linksInThisFile = new ArrayList<>(17);
      filesToRelativeLinks.put(file, linksInThisFile);

      final String fileContents = readFile(file.getPath());
      final Document doc = Jsoup.parse(fileContents);

      // For Jekyll, we only care about class='main-content' -- we don't want to worry
      // about ids/links duplicated in the header/footer of every page,
      final String mainContentSelector = bareBones ? "body" : ".main-content";
      final Element mainContent = doc.select(mainContentSelector).first();
      if (mainContent == null) {
        throw new RuntimeException(file.getName() + " has no main content: " + mainContentSelector);
      }

      // Add all of the IDs in (the main-content of) this doc to idsToFiles (and idsInMultiFiles if needed)
      final Elements nodesWithIds = mainContent.select("[id]");

      if (bareBones) {
        // It's painful to customize the HTML output structure of asciidoctor's bare-bones html5 backend
        // so instead we "fake" that the body tag contains the attribute we use in jekyll
        // (and what gets added explicitly to each top level section in the PDF)
        nodesWithIds.add(new Element(Tag.valueOf("body"), "")
                         .attr("id", file.getName().replaceAll("\\.html$", "")));
      } else {
        // We have to add Jekyll's <body> to the nodesWithIds so we check the main section anchor as well,
        // since the '.main-content' selection above does not include the body element itself
        nodesWithIds.addAll(doc.select("body[id]"));
      }

      boolean foundPreamble = false;
      for (Element node : nodesWithIds) {
        final String id = node.id();
        assert null != id;
        assert 0 != id.length();

        // special case id: we ignore the first 'preamble' because
        // it's part of the core markup that asciidoctor always uses
        // if we find it a second time in a single page, fail with a special error...
        if (id.equals("preamble")) {
          if (foundPreamble) {
            problems++;
            System.err.println(file.toURI().toString() +
                               " contains 'preamble' anchor, this is special in jekyll and must not be used in content.");
          } else {
            foundPreamble = true;
            continue;
          }
        }

        if (idsToFiles.containsKey(id)) {
          idsInMultiFiles.add(id);
        } else {
          idsToFiles.put(id, new ArrayList<File>(1));
        }
        idsToFiles.get(id).add(file);
      }

      // check for (relative) links that don't include a fragment
      final Elements links = mainContent.select("a[href]");
      for (Element link : links) {
        totalLinks++;
        final String href = link.attr("href");
        if (0 == href.length()) {
          problems++;
          System.err.println(file.toURI().toString() + " contains link with empty href");
          // NOTE(review): processing deliberately continues as in the original code, so an empty href
          // is presumably also reported below as a relative link w/o an anchor -- confirm this is desired
        }
        try {
          final URI uri = new URI(href);
          if (! uri.isAbsolute()) {
            totalRelativeLinks++;
            final String frag = uri.getFragment();
            if (null == frag || "".equals(frag)) {
              // we must have a fragment for intra-page links to work correctly
              problems++;
              System.err.println(file.toURI().toString() + " contains relative link w/o an '#anchor': " + href);
            } else {
              // track the link to validate it exists in the target doc
              linksInThisFile.add(uri);
            }
          }
        } catch (URISyntaxException uri_ex) {
          // before reporting a problem, see if it can be parsed as a valid (absolute) URL
          // some solr examples URLs have characters that aren't legal URI characters
          // Example: "ipod^3.0", "foo:[*+TO+*]", etc...
          try {
            // constructed only for validation: if this isn't absolute, it will fail
            new URL(href);
          } catch (MalformedURLException url_ex) {
            problems++;
            System.err.println(file.toURI().toString() + " contains link w/ invalid syntax: " + href);
            System.err.println(" ... as URI: " + uri_ex.toString());
            System.err.println(" ... as URL: " + url_ex.toString());
          }
        }
      }

      problems += validateHtmlStructure(file, mainContent);
    }

    // if there are problematic ids, report them
    for (String id : idsInMultiFiles) {
      problems++;
      System.err.println("ID occurs multiple times: " + id);
      for (File file : idsToFiles.get(id)) {
        System.err.println(" ... " + file.toURI().toString());
      }
    }

    // check every (relative) link in every file to ensure the frag exists in the target page
    for (Map.Entry<File, List<URI>> entry : filesToRelativeLinks.entrySet()) {
      final File source = entry.getKey();
      for (URI link : entry.getValue()) {
        // a link with no path (just '#frag') points back into the file that contains it
        final String path = (null == link.getPath() || "".equals(link.getPath()))
          ? source.getName() : link.getPath();
        final String frag = link.getFragment();
        if ( ! idsInMultiFiles.contains(frag) ) { // skip problematic dups already reported
          final File dest = new File(htmlDir, path);
          if ( ! dest.exists() ) {
            problems++;
            System.err.println("Relative link points at dest file that doesn't exist: " + link);
            System.err.println(" ... source: " + source.toURI().toString());
          } else if ( ( ! idsToFiles.containsKey(frag) ) || // no file contains this id, or...
                      // id exists, but not in linked file
                      // (get(0) is sufficient: ids recorded more than once were skipped above)
                      ( ! idsToFiles.get(frag).get(0).getName().equals(path) )) {
            problems++;
            System.err.println("Relative link points at id that doesn't exist in dest: " + link);
            System.err.println(" ... source: " + source.toURI().toString());
          }
        }
      }
    }

    System.err.println("Processed " + totalLinks + " links (" + totalRelativeLinks + " relative) to " +
                       idsToFiles.size() + " anchors in " + pages.length + " files");
    if (0 < problems) {
      System.err.println("Total of " + problems + " problems found");
      System.exit(-1);
    }
  }

  /**
   * Reads the named file as UTF-8 text.
   *
   * @param fileName path of the file to read
   * @return the file contents, with every line (including the last) terminated by a single '\n'
   * @throws IOException if the file can not be opened or read
   */
  static String readFile(String fileName) throws IOException {
    // try-with-resources closes both the raw stream and the reader, even if reading fails part way
    try (InputStream in = new FileInputStream(fileName);
         BufferedReader br = new BufferedReader(
             new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
      final StringBuilder sb = new StringBuilder();
      for (String line = br.readLine(); line != null; line = br.readLine()) {
        sb.append(line);
        sb.append("\n");
      }
      return sb.toString();
    }
  }

  /**
   * Validates the 'dynamic-tabs' / 'tab-pane' / 'tab-label' structure of a page, printing a
   * message to stderr for every problem.
   *
   * @param f the file being checked (only used for error messages)
   * @param mainContent the element whose descendants are checked
   * @return the number of problems found with this file
   */
  private static int validateHtmlStructure(final File f, final Element mainContent) {
    final String file = f.toURI().toString();
    int problems = 0;

    for (Element tab : mainContent.select(".dynamic-tabs")) {
      // must be at least two tab-pane descendants of each dynamic-tabs
      final Elements panes = tab.select(".tab-pane");
      final int numPanes = panes.size();
      if (numPanes < 2) {
        System.err.println(file + " contains a 'dynamic-tabs' with "+ numPanes+" 'tab-pane' decendents -- must be at least 2");
        problems++;
      }

      // must not have any descendants of a dynamic-tabs that are not part of tab-pane
      //
      // this is kind of tricky, because asciidoctor creates wrapper divs around the tab-panes
      // so we can't make assumptions about direct children
      //
      final Elements elementsToIgnore = panes.parents();
      for (Element pane : panes) {
        elementsToIgnore.addAll(pane.select("*"));
      }
      final Elements nonPaneDecendents = tab.select("*");
      nonPaneDecendents.removeAll(elementsToIgnore);
      if (0 != nonPaneDecendents.size()) {
        System.err.println(file + " contains a 'dynamic-tabs' with content outside of a 'tab-pane': " +
                           shortStr(nonPaneDecendents.text()));
        problems++;
      }
    }

    // Now fetch all tab-panes, even if they aren't in a dynamic-tabs instance
    // (that's a type of error we want to check for)
    final Elements validPanes = mainContent.select(".dynamic-tabs .tab-pane");
    final Elements allPanes = mainContent.select(".tab-pane");
    for (Element pane : allPanes) {
      // every tab-pane must have an id
      if (pane.id().trim().isEmpty()) {
        System.err.println(file + " contains a 'tab-pane' that does not have a (unique) '#id'");
        problems++;
      }
      final String debug = "'tab-pane" + (pane.id().isEmpty() ? "" : "#" + pane.id()) + "'";

      // no 'active' class on any tab-pane
      if (pane.classNames().contains("active")) {
        System.err.println(file + " contains " + debug + " with 'active' defined -- this must be removed");
        problems++;
      }

      // every tab-pane must be a descendant of a dynamic-tabs
      if (! validPanes.contains(pane)) {
        System.err.println(file + " contains " + debug + " that is not a decendent of a 'dynamic-tabs'");
        problems++;
      }

      // every tab-pane must have exactly 1 tab-label which is <strong>
      final Elements labels = pane.select(".tab-label");
      if (1 != labels.size()) {
        System.err.println(file + " contains " + debug + " with " + labels.size() + " 'tab-label' decendents -- must be exactly 1");
        problems++;
      } else {
        final Element label = labels.first();
        if (! label.tagName().equals("strong")) {
          System.err.println(file + " contains " + debug + " with a 'tab-label' using <"
                             + label.tagName() + "> -- each 'tab-label' must be <strong> (example: '[.tab-label]*Text*')");
          problems++;
        }
        final String labelText = label.text().trim();
        // if the tab-label is the empty string, asciidoctor should optimize it away -- but let's check for it anyway
        if (labelText.isEmpty()) {
          System.err.println(file + " contains " + debug + " with a blank 'tab-label'");
          problems++;
        }
        // validate label must be first paragraph? first text content?
        if (! pane.text().trim().startsWith(labelText)) {
          System.err.println(file + " contains " + debug + " with text before the 'tab-label' ('" + labelText + "')");
          problems++;
        }
      }
    }

    return problems;
  }

  /**
   * Abbreviates a string for use in error messages.
   *
   * @param s the string to shorten
   * @return <code>s</code> unchanged if it is shorter than 20 characters, otherwise its first
   *         17 characters followed by "..."
   */
  public static final String shortStr(String s) {
    if (s.length() < 20) {
      return s;
    }
    return s.substring(0, 17) + "...";
  }
}