SOLR-10934: ref-guide link+anchor checking that doesn't require jekyll

This commit is contained in:
Chris Hostetter 2017-11-03 10:44:03 -07:00
parent 1d2787464f
commit 7f033ac12b
4 changed files with 231 additions and 62 deletions

View File

@ -113,17 +113,19 @@
<target name="build-tools-jar" depends="resolve" description="Builds the custom java tools use use for generating some data files from page metdata">
<mkdir dir="${build.dir}/classes"/>
<!-- NOTE: we include the ant runtime so we can compile our customized version of the asciidoctor ant task -->
<javac debug="yes"
debuglevel="source,lines,vars"
destdir="${build.dir}/classes"
includeantruntime="false">
includeantruntime="true">
<compilerarg value="-Xlint:all"/>
<classpath refid="tools-compile-classpath"/>
<src path="tools/"/>
</javac>
<copy todir="${build.dir}/classes" file="tools/asciidoctor-antlib.xml" />
<jar destfile="${build.dir}/${tools-jar-name}">
<fileset dir="${build.dir}/classes"
includes="**/*.class"/>
includes="**/*.class,**/*.xml"/>
</jar>
</target>
@ -164,58 +166,80 @@
</java>
</target>
<!-- Runs the CheckLinksAndAnchors tool in a forked JVM (using tools-run-classpath) against the
HTML files under ${build.dir}/html-site. Because failonerror="true", a non-zero exit status
from the tool fails the build.
-->
<target name="check-links-and-anchors" depends="build-init,build-tools-jar" description="Parse the HTML site files to check for problematic links or anchors">
<java classname="CheckLinksAndAnchors"
failonerror="true"
fork="true">
<classpath refid="tools-run-classpath"/>
<!-- the only argument: the directory of html files to check -->
<arg value="${build.dir}/html-site"/>
</java>
</target>
<macrodef name="asciidoctor-convert">
<!-- custom macro that fills in all the defaults we care about when running asciidoctor-ant
The primary purpose for this is to build the PDF, but we also use it to build a bare-bones
HTML version for validating the document structure (ie: duplicate anchors, links all point to valid anchors,
etc...) that we can't do with the generated PDF, and that we want to be able to validate
even if the current user doesn't have jekyll installed
-->
<!-- Macro parameters (each is passed straight through to the same-named attribute of the convert task below):
sourceDirectory     : directory containing the adoc source(s) to convert
sourceDocumentName  : name of a single document to convert; callers may pass "" (see NOTE below)
outputDirectory     : directory the converted output is written to
backend             : asciidoctor backend name (callers in this file use "pdf" and "html5")
headerFooter        : optional, defaults to "true"
-->
<attribute name="sourceDirectory"/>
<attribute name="sourceDocumentName"/>
<attribute name="outputDirectory"/>
<attribute name="backend"/>
<attribute name="headerFooter" default="true" />
<sequential>
<!-- NOTE: we have our own variant on the asciidoctor-ant task, so that sourceDocumentName=""
is treated the same as if it's unset (ie: null)
-->
<!-- Load our own antlib descriptor (asciidoctor-antlib.xml, found via tools-run-classpath) under the
antlib:org.asciidoctor.ant namespace uri, so that the convert element below resolves to
whatever task that descriptor defines
-->
<taskdef uri="antlib:org.asciidoctor.ant" resource="asciidoctor-antlib.xml"
classpathref="tools-run-classpath"/>
<!-- Settings that are the same for every caller of this macro are hard coded here:
baseDir and imagesDir are both ${build.content.dir}, only "adoc" files are considered,
and the doctype is always "book"
-->
<asciidoctor:convert
sourceDirectory="@{sourceDirectory}"
sourceDocumentName="@{sourceDocumentName}"
baseDir="${build.content.dir}"
outputDirectory="@{outputDirectory}"
preserveDirectories="true"
backend="@{backend}"
headerFooter="@{headerFooter}"
extensions="adoc"
sourceHighlighter="coderay"
imagesDir="${build.content.dir}"
doctype="book"
safemode="unsafe">
<attribute key="section-toc" value='' /><!-- we don't use these in the pdf -->
<!-- icon and PDF theme/font settings; the pdf-* paths are relative (presumably resolved against baseDir; confirm) -->
<attribute key="icons" value="font" />
<attribute key="icon-set" value="fa" />
<attribute key="pdf-stylesDir" value="./pdf/themes"/>
<attribute key="pdf-style" value="refguide"/>
<attribute key="pdf-fontsDir" value="./fonts"/>
<attribute key="figure-caption!" value='' />
<!-- no id prefix, and '-' as the id separator -->
<attribute key="idprefix" value='' />
<attribute key="idseparator" value='-' />
<!-- attributes used in adoc files -->
<!-- NOTE: If you add any attributes here for use in adoc files, you almost certainly need to also add
them to the _config.yml.template file for building the jekyll site as well
-->
<attribute key="solr-guide-draft-status" value="${solr-guide-draft-status}" />
<attribute key="solr-guide-version" value="${solr-guide-version}" />
<attribute key="solr-docs-version" value="${solr-docs-version}" />
<attribute key="solr-javadocs" value="${solr-javadocs}" />
<attribute key="lucene-javadocs" value="${lucene-javadocs}" />
<attribute key="build-date" value="${DSTAMP}" />
<attribute key="build-year" value="${current.year}" />
</asciidoctor:convert>
</sequential>
</macrodef>
<!-- ====== PDF Build ======= -->
<target name="build-pdf" depends="-build-raw-pdf,-reduce-pdf-size" description="Builds a PDF">
<target name="build-pdf" depends="bare-bones-html-validation,-build-pdf-and-reduce-pdf"
description="Builds the PDF (after building &amp; validating a bare-bones html version)" />
<target name="-build-pdf-and-reduce-pdf" depends="-build-raw-pdf,-reduce-pdf-size">
<!-- NOTE: this does everything related to building the PDF, but skips the bare-bones-html validation -->
<echo>Finished Building ${build.dir}/${pdf-filename}</echo>
</target>
<target name="-build-raw-pdf"
depends="build-nav-data-files,resolve">
<mkdir dir="${build.dir}/pdf-tmp"/>
<taskdef uri="antlib:org.asciidoctor.ant" resource="org/asciidoctor/ant/antlib.xml"
classpathref="tools-run-classpath"/>
<asciidoctor:convert
sourceDirectory="${build.content.dir}/pdf"
sourceDocumentName="SolrRefGuide-all.adoc"
baseDir="${build.content.dir}"
outputDirectory="${build.dir}/pdf-tmp"
backend="pdf"
extensions="adoc"
sourceHighlighter="coderay"
imagesDir="${build.content.dir}"
doctype="book"
safemode="unsafe">
<attribute key="section-toc" value='' /><!-- we don't use these in the pdf -->
<attribute key="icons" value="font" />
<attribute key="icon-set" value="fa" />
<attribute key="pdf-stylesDir" value="./pdf/themes"/>
<attribute key="pdf-style" value="refguide"/>
<attribute key="pdf-fontsDir" value="./fonts"/>
<attribute key="figure-caption!" value='' />
<attribute key="idprefix" value='' />
<attribute key="idseparator" value='-' />
<!-- attributes used in adoc files -->
<!-- NOTE: If you add any attributes here for use in adoc files, you almost certainly need to also add
them to the _config.yml.template file for building the jekyll site as well
-->
<attribute key="solr-guide-draft-status" value="${solr-guide-draft-status}" />
<attribute key="solr-guide-version" value="${solr-guide-version}" />
<attribute key="solr-docs-version" value="${solr-docs-version}" />
<attribute key="solr-javadocs" value="${solr-javadocs}" />
<attribute key="lucene-javadocs" value="${lucene-javadocs}" />
<attribute key="build-date" value="${DSTAMP}" />
<attribute key="build-year" value="${current.year}" />
</asciidoctor:convert>
<asciidoctor-convert sourceDirectory="${build.content.dir}/pdf"
sourceDocumentName="SolrRefGuide-all.adoc"
outputDirectory="${build.dir}/pdf-tmp"
backend="pdf"
/>
<move file="${build.dir}/pdf-tmp/SolrRefGuide-all.pdf" tofile="${build.dir}/pdf-tmp/RAW-${pdf-filename}" />
</target>
<target name="-reduce-pdf-size" depends="build-init,build-tools-jar">
<java classname="ReducePDFSize"
failonerror="true"
@ -232,24 +256,61 @@
Builds site with Jekyll.
This (for now) assumes that Jekyll (http://jekyllrb.com) is installed locally. -->
<target name="build-site"
depends="-build-site,check-links-and-anchors"
depends="-build-site"
description="Builds an HTML Site w/Jekyll and verifies the anchors+links are valid" >
<java classname="CheckLinksAndAnchors"
failonerror="true"
fork="true">
<classpath refid="tools-run-classpath"/>
<arg value="${build.dir}/html-site"/>
</java>
<echo>Ready to browse site: ${build.dir}/html-site/${main-page}.html</echo>
</target>
<target name="-build-site"
depends="build-init,build-nav-data-files"
description="Builds an HTML Site w/Jekyll">
depends="build-init,build-nav-data-files" >
<echo>Running Jekyll...</echo>
<exec executable="jekyll" dir="${build.content.dir}">
<arg value="build"/>
</exec>
</target>
<!-- ======= HTML Bare Bones Conversion =======
Does a very raw conversion of the adoc files to HTML for the purpose of link & anchor checking.
Unlike the "HTML Site Build" above, this does *NOT* require Jekyll, and can be done entirely
with ivy deps fetched automatically (just like the PDF)
-->
<!-- Converts the adoc content to plain HTML using the asciidoctor-convert macro, then runs the
CheckLinksAndAnchors tool on the result. The build fails if the tool exits non-zero.
-->
<target name="bare-bones-html-validation" depends="build-init,build-nav-data-files"
description="Builds (w/o Jekyll) a very simple html version of the guide and runs link/anchor validation on it">
<!-- remove and recreate the output dir, so only files from this run are present when validating -->
<delete dir="${build.dir}/bare-bones-html"/>
<mkdir dir="${build.dir}/bare-bones-html"/>
<!-- sourceDocumentName is deliberately empty: no single document is named, and the macro's
customized task treats "" the same as unset -->
<asciidoctor-convert sourceDirectory="${build.content.dir}"
sourceDocumentName=""
outputDirectory="${build.dir}/bare-bones-html"
headerFooter="false"
backend="html5"
/>
<java classname="CheckLinksAndAnchors"
failonerror="true"
fork="true">
<classpath refid="tools-run-classpath"/>
<!-- 1st arg: directory of html files to check -->
<arg value="${build.dir}/bare-bones-html"/>
<!-- 2nd arg: "true" selects the tool's bare-bones mode -->
<arg value="true" />
</java>
<echo>Validated Links &amp; Anchors via: ${build.dir}/bare-bones-html/</echo>
</target>
<target name="default"
description="Builds both a PDF and HTML versions of the ref guide"
depends="build-pdf,build-site">
depends="-build-pdf-and-reduce-pdf,build-site">
<!-- NOTE: we don't depend on build-pdf because then we'd also get the bare-bones HTML and do
link validation twice -->
<echo>PDF: ${build.dir}/${pdf-filename}</echo>
<echo>SITE: ${build.dir}/html-site/${main-page}.html</echo>
</target>
</project>

View File

@ -45,18 +45,41 @@ import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
/**
* Check various things regarding links in the generated HTML site.
* Check various things regarding anchors &amp; links in the generated HTML site.
* <p>
* Asciidoctor doesn't do a good job of rectifying situations where multiple documents are included in one
* massive (PDF) document may have identical anchors (either explicitly defined, or implicitly defined because of
* section headings). Asciidoctor also doesn't support linking directly to another (included) document by name,
* unless there is an explicit '#fragement' used inthe link.
* unless there is an explicit '#fragment' used in the link.
* </p>
* <p>
* This tool parses the generated HTML site, looking for these situations in order to fail the build -- since the
* equivilent PDF will be broken. It also does sme general check of the relative URLs to ensure the destination
* equivalent PDF will be broken. It also does some general checks of the relative URLs to ensure the destination
* files/anchors actually exist.
* </p>
* <p>
* This tool supports 2 modes, depending on whether you want to run it against the HTML generated by Jekyll, or
* the "bare bones" HTML generated directly by asciidoctor...
* </p>
* <ul>
* <li>Jekyll Mode:
* <ul>
* <li><code>CheckLinksAndAnchors html-dir-name/ [false]</code></li>
* <li>Requires all html pages have a "main-content" div; ignores all links &amp; anchors that
* are <em>not</em> descendants of this div (to exclude redundant template based header, footer, &amp; sidebar links)
* </li>
* <li>Expects that the <code>&lt;body/&gt;</code> tag will have an <code>id</code> matching the page shortname.</li>
* </ul>
* </li>
* <li>Bare Bones Mode:
* <ul>
* <li><code>CheckLinksAndAnchors html-dir-name/ true</code></li>
* <li>Checks all links &amp; anchors in the page.</li>
* <li>"Fakes" the existence of a <code>&lt;body id="..."&gt;</code> tag containing the page shortname.</li>
* </ul>
* </li>
* </ul>
*
*
* TODO: build a list of all known external links so that some other tool could (optionally) ping them all for 200 status?
*
@ -74,11 +97,12 @@ public class CheckLinksAndAnchors {
public static void main(String[] args) throws Exception {
int problems = 0;
if (args.length != 1) {
System.err.println("usage: CheckLinksAndAnchors <htmldir>");
if (args.length < 1 || 2 < args.length ) {
System.err.println("usage: CheckLinksAndAnchors <htmldir> [<bare-bones-boolean>]");
System.exit(-1);
}
final File htmlDir = new File(args[0]);
final boolean bareBones = (2 == args.length) ? Boolean.parseBoolean(args[1]) : false;
final File[] pages = htmlDir.listFiles(new HtmlFileFilter());
if (0 == pages.length) {
@ -89,6 +113,9 @@ public class CheckLinksAndAnchors {
final Map<String,List<File>> idsToFiles = new HashMap<>();
final Map<File,List<URI>> filesToRelativeLinks = new HashMap<>();
final Set<String> idsInMultiFiles = new HashSet<>(0);
int totalLinks = 0;
int totalRelativeLinks = 0;
for (File file : pages) {
//System.out.println("input File URI: " + file.toURI().toString());
@ -99,25 +126,47 @@ public class CheckLinksAndAnchors {
final String fileContents = readFile(file.getPath());
final Document doc = Jsoup.parse(fileContents);
// we only care about class='main-content' -- we don't want to worry
// For Jekyll, we only care about class='main-content' -- we don't want to worry
// about ids/links duplicated in the header/footer of every page,
final Element mainContent = doc.select(".main-content").first();
final String mainContentSelector = bareBones ? "body" : ".main-content";
final Element mainContent = doc.select(mainContentSelector).first();
if (mainContent == null) {
throw new RuntimeException(file.getName() + " has no main-content div");
throw new RuntimeException(file.getName() + " has no main content: " + mainContentSelector);
}
// Add all of the IDs in (the main-content of) this doc to idsToFiles (and idsInMultiFiles if needed)
final Elements nodesWithIds = mainContent.select("[id]");
// NOTE: add <body> to the nodesWithIds so we check the main section anchor as well
nodesWithIds.addAll(doc.select("body[id]"));
if (bareBones) {
// It's painful to customize the HTML output structure of asciidoctor's bare-bones html5 backend,
// so instead we "fake" that the body tag contains the attribute we use in jekyll
// (and what gets added explicitly to each top level section in the PDF)
nodesWithIds.add(new Element(Tag.valueOf("body"), "").attr("id", file.getName().replaceAll("\\.html$","")));
} else {
// We have to add Jekyll's <body> to the nodesWithIds so we check the main section anchor as well,
// since we've already restricted nodesWithIds to elements inside the main-content div
nodesWithIds.addAll(doc.select("body[id]"));
}
boolean foundPreamble = false;
for (Element node : nodesWithIds) {
final String id = node.id();
assert null != id;
assert 0 != id.length();
// special case ids that we ignore
// special case id: we ignore the first 'preamble' because
// it's part of the core markup that asciidoctor always uses
// if we find it a second time in a single page, fail with a special error...
if (id.equals("preamble")) {
continue;
if (foundPreamble) {
problems++;
System.err.println(file.toURI().toString() +
" contains 'preamble' anchor, this is special in jekyll and must not be used in content.");
} else {
foundPreamble = true;
continue;
}
}
if (idsToFiles.containsKey(id)) {
@ -131,6 +180,7 @@ public class CheckLinksAndAnchors {
// check for (relative) links that don't include a fragment
final Elements links = mainContent.select("a[href]");
for (Element link : links) {
totalLinks++;
final String href = link.attr("href");
if (0 == href.length()) {
problems++;
@ -139,6 +189,7 @@ public class CheckLinksAndAnchors {
try {
final URI uri = new URI(href);
if (! uri.isAbsolute()) {
totalRelativeLinks++;
final String frag = uri.getFragment();
if (null == frag || "".equals(frag)) {
// we must have a fragment for intra-page links to work correctly
@ -200,7 +251,8 @@ public class CheckLinksAndAnchors {
}
}
System.err.println("Processed " + totalLinks + " links (" + totalRelativeLinks + " relative) to " +
idsToFiles.size() + " anchors in " + pages.length + " files");
if (0 < problems) {
System.err.println("Total of " + problems + " problems found");
System.exit(-1);

View File

@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.asciidoctor.ant.AsciidoctorAntTask;
/**
 * Thin subclass of {@link AsciidoctorAntTask} whose only change is how the
 * <code>sourceDocumentName</code> setter handles the empty string.
 * <p>
 * An Ant macro always supplies <em>some</em> value for the attributes it forwards, so a wrapper
 * macro has no way to leave <code>sourceDocumentName</code> unspecified.  This task lets such a
 * macro pass <code>""</code> instead: the empty string is replaced with <code>null</code>
 * before being handed to the parent task.
 * </p>
 */
public class CustomizedAsciidoctorAntTask extends AsciidoctorAntTask {
  /**
   * Sets the source document name, converting <code>""</code> to <code>null</code> first.
   *
   * @param sourceDocumentName the document name from the build file; <code>null</code> and
   *        non-empty values are forwarded to the superclass unchanged
   */
  @SuppressWarnings("UnusedDeclaration")
  public void setSourceDocumentName(String sourceDocumentName) {
    final String effectiveName = "".equals(sourceDocumentName) ? null : sourceDocumentName;
    super.setSourceDocumentName(effectiveName);
  }
}

View File

@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<antlib>
<!-- Defines a "convert" type that is implemented by our CustomizedAsciidoctorAntTask class -->
<typedef name="convert" classname="CustomizedAsciidoctorAntTask"/>
</antlib>