mirror of https://github.com/apache/lucene.git
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68
parent efa3bdf01e
commit 422bed652e
@@ -0,0 +1,156 @@
import traceback
import os
import sys
import re
from HTMLParser import HTMLParser, HTMLParseError
import urlparse

# Note: these two regexps are not currently used; links are extracted with
# the HTMLParser subclass below instead:
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)

# silly emacs: '

class FindHyperlinks(HTMLParser):

  def __init__(self, baseURL):
    HTMLParser.__init__(self)
    self.anchors = set()
    self.links = []
    self.baseURL = baseURL
    self.printed = False

  def handle_starttag(self, tag, attrs):
    if tag == 'a':
      name = None
      href = None
      for attName, attValue in attrs:
        if attName == 'name':
          name = attValue
        elif attName == 'href':
          href = attValue

      if name is not None:
        assert href is None
        if name in self.anchors:
          if name in ('serializedForm',
                      'serialized_methods',
                      'readObject(java.io.ObjectInputStream)',
                      'writeObject(java.io.ObjectOutputStream)') \
             and self.baseURL.endswith('/serialized-form.html'):
            # Seems like a bug in Javadoc generation... you can't have the
            # same anchor name more than once...
            pass
          else:
            self.printFile()
            print '  WARNING: anchor "%s" appears more than once' % name
        else:
          self.anchors.add(name)
      elif href is not None:
        assert name is None
        self.links.append(urlparse.urljoin(self.baseURL, href))
      else:
        if self.baseURL.endswith('/AttributeSource.html'):
          # LUCENE-4010: AttributeSource's javadocs have an unescaped <A>
          # from generics!!  Seems to be a javadocs bug... (fixed in Java 7)
          pass
        else:
          raise RuntimeError('BUG: %s' % attrs)

  def printFile(self):
    if not self.printed:
      print
      print '  ' + self.baseURL
      self.printed = True

def parse(baseURL, html):
  parser = FindHyperlinks(baseURL)
  try:
    parser.feed(html)
    parser.close()
  except HTMLParseError:
    parser.printFile()
    print '  WARNING: failed to parse:'
    traceback.print_exc()
    return [], []

  #print '  %d links, %d anchors' % \
  #      (len(parser.links), len(parser.anchors))
  return parser.links, parser.anchors

def checkAll(dirName):
  """
  Checks *.html (recursively) under this directory.
  """

  # Find/parse all HTML files first
  print
  print 'Crawl/parse...'
  allFiles = {}

  if os.path.isfile(dirName):
    root, fileName = os.path.split(dirName)
    iter = ((root, [], [fileName]),)
  else:
    iter = os.walk(dirName)

  for root, dirs, files in iter:
    for f in files:
      main, ext = os.path.splitext(f)
      ext = ext.lower()

      # maybe?:
      # and main not in ('serialized-form'):
      if ext in ('.htm', '.html') and \
         not f.startswith('.#') and \
         main not in ('deprecated-list',):
        # Somehow, even with Java 7 generated javadocs,
        # deprecated-list.html can fail to escape generic types
        fullPath = os.path.join(root, f)
        #print '  %s' % fullPath
        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())

  # ... then verify:
  print
  print 'Verify...'
  for fullPath, (links, anchors) in allFiles.items():
    #print fullPath
    printed = False
    for link in links:

      origLink = link

      # TODO: use urlparse?
      idx = link.find('#')
      if idx != -1:
        anchor = link[idx+1:]
        link = link[:idx]
      else:
        anchor = None

      idx = link.find('?')
      if idx != -1:
        link = link[:idx]

      # TODO: normalize path sep for windows...
      if link.startswith('http://') or link.startswith('https://'):
        # don't check external links
        pass
      elif link not in allFiles:
        # We only load HTML... so if the link is to another resource (eg
        # SweetSpotSimilarity refs
        # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:
        if not os.path.exists(link):
          if not printed:
            printed = True
            print
            print fullPath
          print '  BROKEN LINK: %s' % link
      elif anchor is not None and anchor not in allFiles[link][1]:
        if not printed:
          printed = True
          print
          print fullPath
        print '  BROKEN ANCHOR: %s' % origLink

if __name__ == '__main__':
  checkAll(sys.argv[1])
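The script is driven entirely by checkAll(): point it at a generated javadocs tree (or a single HTML file) and it crawls, parses, then cross-checks every link and anchor. A minimal usage sketch, assuming the script is saved as checkJavadocLinks.py (the commit does not show the file path) and run under Python 2:

    # Hypothetical invocation; the script name and docs path are assumptions.
    #   python checkJavadocLinks.py lucene/build/docs
    #
    # Or reuse it as a module from another Python 2 script:
    import checkJavadocLinks
    checkJavadocLinks.checkAll('lucene/build/docs')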
@@ -191,7 +191,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
  * interletter values. In other words, it does something like:
  * </p>
  * <code>
- * for(i=0; i<patterns.length; i++) {
+ * for(i=0; i&lt;patterns.length; i++) {
  * if ( word.substring(index).startsWidth(patterns[i]) )
  * update_interletter_values(patterns[i]);
  * }
@@ -54,7 +54,7 @@ import java.util.ArrayList;
  * merge fewer segments (down to 1 at once, if that one has
  * deletions) to keep the segment size under budget.
  *
- * <p<b>NOTE</b>: this policy freely merges non-adjacent
+ * <p><b>NOTE</b>: this policy freely merges non-adjacent
  * segments; if this is a problem, use {@link
  * LogMergePolicy}.
  *
@@ -103,7 +103,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><font=-1><u>VSM Score</u></font></center>
+ * <center><font size=-1><u>VSM Score</u></font></center>
  * </td></tr>
  * </table>
  * <br> <br>
@@ -194,7 +194,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
+ * <center><font size=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
  * </td></tr>
  * </table>
  * <br> <br>
@@ -291,7 +291,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
+ * <center><font size=-1><u>Lucene Practical Scoring Function</u></font></center>
  * </td></tr>
  * </table>
  *
@@ -410,7 +410,7 @@ import org.apache.lucene.util.SmallFloat;
  * computes this value as:
  *
  * <br> <br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
  * <tr>
  *   <td valign="middle" align="right" rowspan="1">
  *     {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} =
@@ -476,7 +476,7 @@ import org.apache.lucene.util.SmallFloat;
  * If the document has multiple fields with the same name, all their boosts are multiplied together:
  *
  * <br> <br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
  * <tr>
  *   <td valign="middle" align="right" rowspan="1">
  *     norm(t,d) =
@@ -30,7 +30,7 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
 /**
  * {@link BytesRefHash} is a special purpose hash-map like data-structure
  * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
- * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
+ * byte arrays to ordinal (Map&lt;BytesRef,int&gt;) storing the hashed bytes
  * efficiently in continuous storage. The mapping to the ordinal is
  * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
  * for each added {@link BytesRef}.
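An unescaped generic like the Map<BytesRef,int> above is exactly what the new checker trips over: HTMLParser reads the stray <...> as a start tag, and when it happens to be <A> it reaches FindHyperlinks with neither a name nor an href. A small Python 2 sketch using the script's parse() from above; the fragment and path are illustrative, not from a real javadoc page:

    # Hypothetical demo of the checker catching an unescaped generic.
    snippet = '<html><body>public interface Attr<A> extends Base</body></html>'
    try:
      parse('/tmp/Demo.html', snippet)
    except RuntimeError, e:
      # FindHyperlinks raises RuntimeError('BUG: ...') because <A> parses
      # as an anchor tag with no name/href attribute.
      print 'caught unescaped generic:', e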
@@ -409,7 +409,7 @@ public final class FieldCacheSanityChecker {
  * it's typically an indication of a possible problem.
  * </p>
  * <p>
- * <bPNOTE:</b> Only the reader, fieldname, and cached value are actually
+ * <b>NOTE:</b> Only the reader, fieldname, and cached value are actually
  * tested -- if two cache entries have different parsers or datatypes but
  * the cached values are the same Object (== not just equal()) this method
  * does not consider that a red flag. This allows for subtle variations
@@ -253,7 +253,7 @@ public final class PagedBytes {
     }
   }
 
-  /** 1<<blockBits must be bigger than biggest single
+  /** 1&lt;&lt;blockBits must be bigger than biggest single
    *  BytesRef slice that will be pulled */
   public PagedBytes(int blockBits) {
     this.blockSize = 1 << blockBits;
@@ -158,7 +158,7 @@ public abstract class FacetRequest implements Cloneable {
   }
 
   /**
-   * If getNumLabel()<getNumResults(), only the first getNumLabel() results
+   * If getNumLabel() &lt; getNumResults(), only the first getNumLabel() results
    * will have their category paths calculated, and the rest will only be
    * available as ordinals (category numbers) and will have null paths.
    * <P>
@@ -1,17 +1,5 @@
 package org.apache.lucene.facet.taxonomy.writercache.cl2o;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
@@ -29,6 +17,20 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
  * limitations under the License.
  */
 
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+// TODO: maybe this could use an FST instead...
+
 /**
  * This is a very efficient LabelToOrdinal implementation that uses a
  * CharBlockArray to store all labels and a configurable number of HashArrays to
@@ -47,7 +49,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
  *
  * <p>
  * This data structure has a much lower memory footprint (~30%) compared to a
- * Java HashMap<String, Integer>. It also only uses a small fraction of objects
+ * Java HashMap&lt;String, Integer&gt;. It also only uses a small fraction of objects
  * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
  * ~50% faster compared to a HashMap for 3M unique labels.
  *
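For reference, a quick sketch of what the script's parse() hands back on a well-formed page, which is the raw material checkAll() verifies; the names and paths below are illustrative only:

    # Hypothetical Python 2 demo of parse() output.
    page = ('<html><body>'
            '<a name="ctor"></a>'
            '<a href="OtherClass.html#method">see</a>'
            '</body></html>')
    links, anchors = parse('/docs/MyClass.html', page)
    # links   == ['/docs/OtherClass.html#method']  (href resolved against baseURL)
    # anchors == set(['ctor'])
    # checkAll() then checks that each link target exists and, if the link
    # carries a #fragment, that the target page declares a matching anchor.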