From 7f033ac12bb290b2cbf5e43672932c31e8b0061a Mon Sep 17 00:00:00 2001
From: Chris Hostetter <hossman@apache.org>
Date: Fri, 3 Nov 2017 10:44:03 -0700
Subject: [PATCH] SOLR-10934: ref-guide link+anchor checking that doesn't
 require jekyll

---
 solr/solr-ref-guide/build.xml                 | 159 ++++++++++++------
 .../tools/CheckLinksAndAnchors.java           |  78 +++++++--
 .../tools/CustomizedAsciidoctorAntTask.java   |  34 ++++
 .../tools/asciidoctor-antlib.xml              |  22 +++
 4 files changed, 231 insertions(+), 62 deletions(-)
 create mode 100644 solr/solr-ref-guide/tools/CustomizedAsciidoctorAntTask.java
 create mode 100644 solr/solr-ref-guide/tools/asciidoctor-antlib.xml
diff --git a/solr/solr-ref-guide/build.xml b/solr/solr-ref-guide/build.xml
index 1ce3778e9f0..6f989b91194 100644
--- a/solr/solr-ref-guide/build.xml
+++ b/solr/solr-ref-guide/build.xml
@@ -113,17 +113,19 @@
 
   <target name="build-tools-jar" depends="resolve" description="Builds the custom java tools use use for generating some data files from page metdata">
     <mkdir dir="${build.dir}/classes"/>
+    <!-- NOTE: we include the ant runtime so we can compile our customized version of the asciidoctor ant task -->
     <javac debug="yes"
            debuglevel="source,lines,vars"
            destdir="${build.dir}/classes"
-           includeantruntime="false">
+           includeantruntime="true">
       <compilerarg value="-Xlint:all"/>
       <classpath refid="tools-compile-classpath"/>
       <src path="tools/"/>
     </javac>
+    <copy todir="${build.dir}/classes" file="tools/asciidoctor-antlib.xml" />
     <jar destfile="${build.dir}/${tools-jar-name}">
       <fileset dir="${build.dir}/classes"
-               includes="**/*.class"/>
+               includes="**/*.class,**/*.xml"/>
     </jar>
   </target>
 
@@ -164,58 +166,80 @@
     </java>
   </target>
 
-  <target name="check-links-and-anchors" depends="build-init,build-tools-jar" description="Parse the HTML site files to check for problematic links or anchors">
-    <java classname="CheckLinksAndAnchors"
-          failonerror="true"
-          fork="true">
-      <classpath refid="tools-run-classpath"/>
-      <arg value="${build.dir}/html-site"/>
-    </java>
-  </target>
+  <macrodef name="asciidoctor-convert">
+    <!-- custom macro that fills in all the defaults we care about when running asciidoctor-ant
+         The primary purpose for this is to build the PDF, but we also use it to build a bare-bones
+         HTML version for validating the document structure (ie: duplicate anchors, links all point to valid anchors,
+         etc...) that we can't do with the generated PDF, and that we want to be able to validate
+         even if the current user doesn't have jekyll installed
+    -->
+    <attribute name="sourceDirectory"/>
+    <attribute name="sourceDocumentName"/>
+    <attribute name="outputDirectory"/>
+    <attribute name="backend"/>
+    <attribute name="headerFooter" default="true" />
+    <sequential>
+      <!-- NOTE: we have our own variant on the asciidoctor-ant task, so that sourceDocumentName=""
+           is treated the same as if it's unset (ie: null)
+      -->
+      <taskdef uri="antlib:org.asciidoctor.ant" resource="asciidoctor-antlib.xml"
+               classpathref="tools-run-classpath"/>
+      <asciidoctor:convert
+                   sourceDirectory="@{sourceDirectory}"
+                   sourceDocumentName="@{sourceDocumentName}"
+                   baseDir="${build.content.dir}"
+                   outputDirectory="@{outputDirectory}"
+                   preserveDirectories="true"
+                   backend="@{backend}"
+                   headerFooter="@{headerFooter}"
+                   extensions="adoc"
+                   sourceHighlighter="coderay"
+                   imagesDir="${build.content.dir}"
+                   doctype="book"
+                   safemode="unsafe">
+        <attribute key="section-toc" value='' /><!-- we don't use these in the pdf -->
+        <attribute key="icons" value="font" />
+        <attribute key="icon-set" value="fa" />
+        <attribute key="pdf-stylesDir" value="./pdf/themes"/>
+        <attribute key="pdf-style" value="refguide"/>
+        <attribute key="pdf-fontsDir" value="./fonts"/>
+        <attribute key="figure-caption!" value='' />
+        <attribute key="idprefix" value='' />
+        <attribute key="idseparator" value='-' />
+        <!-- attributes used in adoc files -->
+        <!-- NOTE: If you add any attributes here for use in adoc files, you almost certainly need to also add
+             them to the _config.yml.template file for building the jekyll site as well
+        -->
+        <attribute key="solr-guide-draft-status" value="${solr-guide-draft-status}" />
+        <attribute key="solr-guide-version" value="${solr-guide-version}" />
+        <attribute key="solr-docs-version" value="${solr-docs-version}" />
+        <attribute key="solr-javadocs" value="${solr-javadocs}" />
+        <attribute key="lucene-javadocs" value="${lucene-javadocs}" />
+        <attribute key="build-date" value="${DSTAMP}" />
+        <attribute key="build-year" value="${current.year}" />
+      </asciidoctor:convert>
+    </sequential>
+  </macrodef>
 
+  
   <!-- ====== PDF Build ======= -->
-  <target name="build-pdf" depends="-build-raw-pdf,-reduce-pdf-size" description="Builds a PDF">
+  <target name="build-pdf" depends="bare-bones-html-validation,-build-pdf-and-reduce-pdf"
+          description="Builds the PDF (after building &amp; validating a bare-bones html version)" />
+  <target name="-build-pdf-and-reduce-pdf" depends="-build-raw-pdf,-reduce-pdf-size">
+    <!-- NOTE: this does everything related to building the PDF, but skips the bare-bones-html validation -->
     <echo>Finished Building ${build.dir}/${pdf-filename}</echo>
   </target>
   <target name="-build-raw-pdf"
           depends="build-nav-data-files,resolve">
     <mkdir dir="${build.dir}/pdf-tmp"/>
-    <taskdef uri="antlib:org.asciidoctor.ant" resource="org/asciidoctor/ant/antlib.xml"
-             classpathref="tools-run-classpath"/>
-    <asciidoctor:convert
-                 sourceDirectory="${build.content.dir}/pdf"
-                 sourceDocumentName="SolrRefGuide-all.adoc"
-                 baseDir="${build.content.dir}"
-                 outputDirectory="${build.dir}/pdf-tmp"
-                 backend="pdf"
-                 extensions="adoc"
-                 sourceHighlighter="coderay"
-                 imagesDir="${build.content.dir}"
-                 doctype="book"
-                 safemode="unsafe">
-      <attribute key="section-toc" value='' /><!-- we don't use these in the pdf -->
-      <attribute key="icons" value="font" />
-      <attribute key="icon-set" value="fa" />
-      <attribute key="pdf-stylesDir" value="./pdf/themes"/>
-      <attribute key="pdf-style" value="refguide"/>
-      <attribute key="pdf-fontsDir" value="./fonts"/>
-      <attribute key="figure-caption!" value='' />
-      <attribute key="idprefix" value='' />
-      <attribute key="idseparator" value='-' />
-      <!-- attributes used in adoc files -->
-      <!-- NOTE: If you add any attributes here for use in adoc files, you almost certainly need to also add
-           them to the _config.yml.template file for building the jekyll site as well
-      -->
-      <attribute key="solr-guide-draft-status" value="${solr-guide-draft-status}" />
-      <attribute key="solr-guide-version" value="${solr-guide-version}" />
-      <attribute key="solr-docs-version" value="${solr-docs-version}" />
-      <attribute key="solr-javadocs" value="${solr-javadocs}" />
-      <attribute key="lucene-javadocs" value="${lucene-javadocs}" />
-      <attribute key="build-date" value="${DSTAMP}" />
-      <attribute key="build-year" value="${current.year}" />
-    </asciidoctor:convert>
+    <asciidoctor-convert sourceDirectory="${build.content.dir}/pdf"
+                         sourceDocumentName="SolrRefGuide-all.adoc"
+                         outputDirectory="${build.dir}/pdf-tmp"
+                         backend="pdf"
+                         />
     <move file="${build.dir}/pdf-tmp/SolrRefGuide-all.pdf" tofile="${build.dir}/pdf-tmp/RAW-${pdf-filename}" />
   </target>
+  
   <target name="-reduce-pdf-size" depends="build-init,build-tools-jar">
     <java classname="ReducePDFSize"
           failonerror="true"
@@ -232,24 +256,61 @@
        Builds site with Jekyll.
        This (for now) assumes that Jekyll (http://jekyllrb.com) is installed locally. -->
   <target name="build-site"
-          depends="-build-site,check-links-and-anchors"
+          depends="-build-site"
           description="Builds an HTML Site w/Jekyll and verifies the anchors+links are valid" >
+    <java classname="CheckLinksAndAnchors"
+          failonerror="true"
+          fork="true">
+      <classpath refid="tools-run-classpath"/>
+      <arg value="${build.dir}/html-site"/>
+    </java>
     <echo>Ready to browse site: ${build.dir}/html-site/${main-page}.html</echo>
   </target>
   <target name="-build-site"
-          depends="build-init,build-nav-data-files"
-          description="Builds an HTML Site w/Jekyll">
+          depends="build-init,build-nav-data-files" >
     <echo>Running Jekyll...</echo>
     <exec executable="jekyll" dir="${build.content.dir}">
       <arg value="build"/>
     </exec>
   </target>
+  
+  <!-- ======= HTML Bare Bones Conversion =======
+       Does a very raw conversion of the adoc files to HTML for the purpose of link & anchor checking
+       
+       Unlike the "HTML Site Build" above, this does *NOT* require Jekyll, and can be done entirely
+       with ivy deps fetched automatically (just like the PDF)
+       -->
+  <target name="bare-bones-html-validation" depends="build-init,build-nav-data-files"
+          description="Builds (w/o Jekyll) a very simple html version of the guide and runs link/anchor validation on it">
+    
+    <delete dir="${build.dir}/bare-bones-html"/>
+    <mkdir dir="${build.dir}/bare-bones-html"/>
+    <asciidoctor-convert sourceDirectory="${build.content.dir}"
+                         sourceDocumentName=""
+                         outputDirectory="${build.dir}/bare-bones-html"
+                         headerFooter="false"
+                         backend="html5"
+                         />
+
+    <java classname="CheckLinksAndAnchors"
+          failonerror="true"
+          fork="true">
+      <classpath refid="tools-run-classpath"/>
+      <arg value="${build.dir}/bare-bones-html"/>
+      <arg value="true" />
+    </java>
+    <echo>Validated Links &amp; Anchors via: ${build.dir}/bare-bones-html/</echo>
+  </target>
 
   <target name="default"
           description="Builds both a PDF and HTML versions of the ref guide"
-          depends="build-pdf,build-site">
+          depends="-build-pdf-and-reduce-pdf,build-site">
+    <!-- NOTE: we don't depend on build-pdf because then we'd also get the bare-bones HTML and do
+         link validation twice -->
     <echo>PDF: ${build.dir}/${pdf-filename}</echo>
     <echo>SITE: ${build.dir}/html-site/${main-page}.html</echo>
   </target>
 
+  
+  
 </project>
diff --git a/solr/solr-ref-guide/tools/CheckLinksAndAnchors.java b/solr/solr-ref-guide/tools/CheckLinksAndAnchors.java
index c5dcac28f07..0dc88d70f17 100644
--- a/solr/solr-ref-guide/tools/CheckLinksAndAnchors.java
+++ b/solr/solr-ref-guide/tools/CheckLinksAndAnchors.java
@@ -45,18 +45,41 @@ import org.jsoup.select.Elements;
 import org.jsoup.select.NodeVisitor;
 
 /**  
- * Check various things regarding links in the generated HTML site.
+ * Check various things regarding anchors &amp; links in the generated HTML site.
  * <p>
  * Asciidoctor doesn't do a good job of rectifying situations where multiple documents are included in one
  * massive (PDF) document may have identical anchors (either explicitly defined, or implicitly defined because of 
  * section headings).  Asciidoctor also doesn't support linking directly to another (included) document by name, 
- * unless there is an explicit '#fragement' used inthe link.
+ * unless there is an explicit '#fragment' used in the link.
  * </p>
  * <p>
  * This tool parses the generated HTML site, looking for these situations in order to fail the build -- since the 
- * equivilent PDF will be broken.  It also does sme general check of the relative URLs to ensure the destination 
+ * equivalent PDF will be broken.  It also does some general checks of the relative URLs to ensure the destination 
  * files/anchors actaully exist.
  * </p>
+ * <p>
+ * This tool supports 2 modes, depending on whether you want to run it against the HTML generated by Jekyll, or 
+ * the "bare bones" HTML generated directly by asciidoctor...
+ * </p>
+ * <ul>
+ *  <li>Jekyll Mode:
+ *    <ul>
+ *      <li><code>CheckLinksAndAnchors html-dir-name/ [false]</code></li>
+ *      <li>Requires all html pages have a "main-content" div; ignores all links &amp; anchors that 
+ *          are <em>not</em> descendants of this div (to exclude redundant template based header, footer, &amp; sidebar links)
+ *      </li>
+ *      <li>Expects that the <code>&lt;body/&gt;</code> tag will have an <code>id</code> matching the page shortname.</li>
+ *    </ul>
+ *  </li>
+ *  <li>Bare Bones Mode:
+ *    <ul>
+ *      <li><code>CheckLinksAndAnchors html-dir-name/ true</code></li>
+ *      <li>Checks all links &amp; anchors in the page.</li>
+ *      <li>"Fakes" the existence of a <code>&lt;body id="..."&gt;</code> tag containing the page shortname.</li>
+ *    </ul>
+ *  </li>
+ * </ul>
+ *   
  * 
  * TODO: build a list of all known external links so that some other tool could (optionally) ping them all for 200 status?
  *
@@ -74,11 +97,12 @@ public class CheckLinksAndAnchors {
   public static void main(String[] args) throws Exception {
     int problems = 0;
     
-    if (args.length != 1) {
-      System.err.println("usage: CheckLinksAndAnchors <htmldir>");
+    if (args.length < 1 || 2 < args.length ) {
+      System.err.println("usage: CheckLinksAndAnchors <htmldir> [<bare-bones-boolean>]");
       System.exit(-1);
     }
     final File htmlDir = new File(args[0]);
+    final boolean bareBones = (2 == args.length) ? Boolean.parseBoolean(args[1]) : false;
     
     final File[] pages = htmlDir.listFiles(new HtmlFileFilter());
     if (0 == pages.length) {
@@ -89,6 +113,9 @@ public class CheckLinksAndAnchors {
     final Map<String,List<File>> idsToFiles = new HashMap<>();
     final Map<File,List<URI>> filesToRelativeLinks = new HashMap<>();
     final Set<String> idsInMultiFiles = new HashSet<>(0);
+
+    int totalLinks = 0;
+    int totalRelativeLinks = 0;
     
     for (File file : pages) {
       //System.out.println("input File URI: " + file.toURI().toString());
@@ -99,25 +126,47 @@ public class CheckLinksAndAnchors {
       
       final String fileContents = readFile(file.getPath());
       final Document doc = Jsoup.parse(fileContents);
-      // we only care about class='main-content' -- we don't want to worry
+      
+      // For Jekyll, we only care about class='main-content' -- we don't want to worry
       // about ids/links duplicated in the header/footer of every page,
-      final Element mainContent = doc.select(".main-content").first();
+      final String mainContentSelector = bareBones ? "body" : ".main-content";
+      final Element mainContent = doc.select(mainContentSelector).first();
       if (mainContent == null) {
-        throw new RuntimeException(file.getName() + " has no main-content div");
+        throw new RuntimeException(file.getName() + " has no main content: " + mainContentSelector);
       }
 
       // Add all of the IDs in (the main-content of) this doc to idsToFiles (and idsInMultiFiles if needed)
       final Elements nodesWithIds = mainContent.select("[id]");
-      // NOTE: add <body> to the nodesWithIds so we check the main section anchor as well
-      nodesWithIds.addAll(doc.select("body[id]"));
+
+      if (bareBones) {
+        // It's a pain in the ass to customize the HTML output structure of asciidoctor's bare-bones html5 backend
+        // so instead we "fake" that the body tag contains the attribute we use in jekyll
+        // (and what gets added explicitly to each top level section in the PDF)
+        nodesWithIds.add(new Element(Tag.valueOf("body"), "").attr("id", file.getName().replaceAll("\\.html$","")));
+      } else {
+        // We have to add Jekyll's <body> to the nodesWithIds so we check the main section anchor as well,
+        // since we've already restricted nodesWithIds to elements inside the main-content div
+        nodesWithIds.addAll(doc.select("body[id]"));
+      }
+          
+      boolean foundPreamble = false;
       for (Element node : nodesWithIds) {
         final String id = node.id();
         assert null != id;
         assert 0 != id.length();
 
-        // special case ids that we ignore
+        // special case id: we ignore the first 'preamble' because
+        // it's part of the core markup that asciidoctor always uses
+        // if we find it a second time in a single page, fail with a special error...
         if (id.equals("preamble")) {
-          continue;
+          if (foundPreamble) {
+            problems++;
+            System.err.println(file.toURI().toString() +
+                               " contains 'preamble' anchor, this is special in jekyll and must not be used in content.");
+          } else {
+            foundPreamble = true;
+            continue;
+          }
         }
         
         if (idsToFiles.containsKey(id)) {
@@ -131,6 +180,7 @@ public class CheckLinksAndAnchors {
       // check for (relative) links that don't include a fragment
       final Elements links = mainContent.select("a[href]");
       for (Element link : links) {
+        totalLinks++;
         final String href = link.attr("href");
         if (0 == href.length()) {
           problems++;
@@ -139,6 +189,7 @@ public class CheckLinksAndAnchors {
         try {
           final URI uri = new URI(href);
           if (! uri.isAbsolute()) {
+            totalRelativeLinks++;
             final String frag = uri.getFragment();
             if (null == frag || "".equals(frag)) {
               // we must have a fragment for intra-page links to work correctly
@@ -200,7 +251,8 @@ public class CheckLinksAndAnchors {
       }
     }
 
-    
+    System.err.println("Processed " + totalLinks + " links (" + totalRelativeLinks + " relative) to " +
+                       idsToFiles.size() + " anchors in " + pages.length + " files");
     if (0 < problems) {
       System.err.println("Total of " + problems + " problems found");
       System.exit(-1);
diff --git a/solr/solr-ref-guide/tools/CustomizedAsciidoctorAntTask.java b/solr/solr-ref-guide/tools/CustomizedAsciidoctorAntTask.java
new file mode 100644
index 00000000000..5c1d700676d
--- /dev/null
+++ b/solr/solr-ref-guide/tools/CustomizedAsciidoctorAntTask.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.asciidoctor.ant.AsciidoctorAntTask;
+
+/**
+ * Customized version of the default AsciidoctorAntTask,
+ * to deal with the fact that we want sourceDocumentName="" treated the same as unspecified (ie: null)
+ * in order to be able to wrap in a macro with defaults
+ */
+public class CustomizedAsciidoctorAntTask extends AsciidoctorAntTask {
+  @SuppressWarnings("UnusedDeclaration")
+  public void setSourceDocumentName(String sourceDocumentName) {
+    if ("".equals(sourceDocumentName)) {
+      sourceDocumentName = null;
+    }
+    super.setSourceDocumentName(sourceDocumentName);
+  }
+}
+
+ 
diff --git a/solr/solr-ref-guide/tools/asciidoctor-antlib.xml b/solr/solr-ref-guide/tools/asciidoctor-antlib.xml
new file mode 100644
index 00000000000..d67e3e15e33
--- /dev/null
+++ b/solr/solr-ref-guide/tools/asciidoctor-antlib.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+<antlib>
+   <typedef name="convert" classname="CustomizedAsciidoctorAntTask"/>
+</antlib>