add simple Python script to find broken javadoc links; fix some HTML escaping in javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-04-22 19:41:57 +00:00
parent efa3bdf01e
commit 422bed652e
9 changed files with 182 additions and 24 deletions

View File

@@ -0,0 +1,156 @@
import traceback
import os
import sys
import re
from HTMLParser import HTMLParser, HTMLParseError
import urlparse

reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']*'|[^'"\s]+))+""", re.I)
# silly emacs: '

class FindHyperlinks(HTMLParser):

  def __init__(self, baseURL):
    HTMLParser.__init__(self)
    self.anchors = set()
    self.links = []
    self.baseURL = baseURL
    self.printed = False

  def handle_starttag(self, tag, attrs):
    if tag == 'a':
      name = None
      href = None
      for attName, attValue in attrs:
        if attName == 'name':
          name = attValue
        elif attName == 'href':
          href = attValue

      if name is not None:
        assert href is None
        if name in self.anchors:
          if name in ('serializedForm',
                      'serialized_methods',
                      'readObject(java.io.ObjectInputStream)',
                      'writeObject(java.io.ObjectOutputStream)') \
             and self.baseURL.endswith('/serialized-form.html'):
            # Seems like a bug in Javadoc generation... you can't have
            # same anchor name more than once...
            pass
          else:
            self.printFile()
            print '  WARNING: anchor "%s" appears more than once' % name
        else:
          self.anchors.add(name)
      elif href is not None:
        assert name is None
        self.links.append(urlparse.urljoin(self.baseURL, href))
      else:
        if self.baseURL.endswith('/AttributeSource.html'):
          # LUCENE-4010: AttributeSource's javadocs has an unescaped <A> generics!! Seems to be a javadocs bug... (fixed in Java 7)
          pass
        else:
          raise RuntimeError('BUG: %s' % attrs)

  def printFile(self):
    if not self.printed:
      print
      print '  ' + self.baseURL
      self.printed = True

def parse(baseURL, html):
  parser = FindHyperlinks(baseURL)
  try:
    parser.feed(html)
    parser.close()
  except HTMLParseError:
    parser.printFile()
    print '  WARNING: failed to parse:'
    traceback.print_exc()
    return [], []

  #print '  %d links, %d anchors' % \
  #      (len(parser.links), len(parser.anchors))
  return parser.links, parser.anchors

def checkAll(dirName):
  """
  Checks *.html (recursively) under this directory.
  """

  # Find/parse all HTML files first
  print
  print 'Crawl/parse...'
  allFiles = {}

  if os.path.isfile(dirName):
    root, fileName = os.path.split(dirName)
    iter = ((root, [], [fileName]),)
  else:
    iter = os.walk(dirName)

  for root, dirs, files in iter:
    for f in files:
      main, ext = os.path.splitext(f)
      ext = ext.lower()

      # maybe?:
      #   and main not in ('serialized-form'):
      if ext in ('.htm', '.html') and \
         not f.startswith('.#') and \
         main not in ('deprecated-list',):
        # Somehow even w/ java 7 generated javadocs,
        # deprecated-list.html can fail to escape generics types
        fullPath = os.path.join(root, f)
        #print '  %s' % fullPath
        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())

  # ... then verify:
  print
  print 'Verify...'
  for fullPath, (links, anchors) in allFiles.items():
    #print fullPath
    printed = False
    for link in links:
      origLink = link

      # TODO: use urlparse?
      idx = link.find('#')
      if idx != -1:
        anchor = link[idx+1:]
        link = link[:idx]
      else:
        anchor = None

      idx = link.find('?')
      if idx != -1:
        link = link[:idx]

      # TODO: normalize path sep for windows...
      if link.startswith('http://') or link.startswith('https://'):
        # don't check external links
        pass
      elif link not in allFiles:
        # We only load HTML... so if the link is another resource (eg
        # SweetSpotSimilarity refs
        # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:
        if not os.path.exists(link):
          if not printed:
            printed = True
            print
            print fullPath
          print '  BROKEN LINK: %s' % link
      elif anchor is not None and anchor not in allFiles[link][1]:
        if not printed:
          printed = True
          print
          print fullPath
        print '  BROKEN ANCHOR: %s' % origLink

if __name__ == '__main__':
  checkAll(sys.argv[1])
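
For reference, a minimal usage sketch; the script name and docs path below are illustrative, since this view does not show where the new file lives in the tree:

# hypothetical invocation (script name and path are placeholders):
#
#   python checkJavadocLinks.py lucene/build/docs
#
# The script crawls *.html under the given directory (or checks a single
# file), then prints a BROKEN LINK / BROKEN ANCHOR report per offending
# file and warns about HTML that HTMLParser cannot parse at all.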

View File

@@ -191,7 +191,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
* interletter values. In other words, it does something like:
* </p>
* <code>
- * for(i=0; i<patterns.length; i++) {
+ * for(i=0; i&lt;patterns.length; i++) {
* if ( word.substring(index).startsWidth(patterns[i]) )
* update_interletter_values(patterns[i]);
* }
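
The escaping fixes in this and the following files all address the same failure mode the new script detects: an unescaped '<' in a javadoc comment becomes a bogus tag in the generated HTML. A small demo, not part of the commit, of how the Python 2 HTMLParser used above reacts to such output:

from HTMLParser import HTMLParser, HTMLParseError

class Dump(HTMLParser):
  def handle_starttag(self, tag, attrs):
    print 'start tag %r, attrs %s' % (tag, attrs)

Dump().feed('type parameter <A> looks like a real anchor')
# -> start tag 'a', attrs []   (hits the checker's RuntimeError BUG branch)

try:
  Dump().feed('mapping (Map<BytesRef,int>) of bytes to ordinals')
except HTMLParseError:
  print 'unparseable'          # hits the checker's "failed to parse" warning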

View File

@@ -54,7 +54,7 @@ import java.util.ArrayList;
* merge fewer segments (down to 1 at once, if that one has
* deletions) to keep the segment size under budget.
*
- * <p<b>NOTE</b>: this policy freely merges non-adjacent
+ * <p><b>NOTE</b>: this policy freely merges non-adjacent
* segments; if this is a problem, use {@link
* LogMergePolicy}.
*

View File

@@ -103,7 +103,7 @@ import org.apache.lucene.util.SmallFloat;
* </table>
* </td></tr>
* <tr><td>
- * <center><font=-1><u>VSM Score</u></font></center>
+ * <center><font size=-1><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
@@ -194,7 +194,7 @@ import org.apache.lucene.util.SmallFloat;
* </table>
* </td></tr>
* <tr><td>
- * <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
+ * <center><font size=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
@@ -291,7 +291,7 @@ import org.apache.lucene.util.SmallFloat;
* </table>
* </td></tr>
* <tr><td>
- * <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
+ * <center><font size=-1><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
@@ -410,7 +410,7 @@ import org.apache.lucene.util.SmallFloat;
* computes this value as:
*
* <br>&nbsp;<br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} &nbsp; = &nbsp;
@@ -476,7 +476,7 @@ import org.apache.lucene.util.SmallFloat;
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br>&nbsp;<br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) &nbsp; = &nbsp;
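
Per the javadoc statement above that boosts of multiple same-named fields are multiplied together, a trivial worked example (hypothetical boost values):

norm = 1.0
for boost in [1.5, 2.0]:   # two same-named fields, illustrative boosts
  norm *= boost
print norm                  # 3.0, folded into norm(t,d)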

View File

@@ -30,7 +30,7 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
/**
* {@link BytesRefHash} is a special purpose hash-map like data-structure
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
- * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
+ * byte arrays to ordinal (Map&lt;BytesRef,int&gt;) storing the hashed bytes
* efficiently in continuous storage. The mapping to the ordinal is
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
* for each added {@link BytesRef}.
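
A toy sketch of the mapping contract described above (analogy only; the real class packs the bytes into a ByteBlockPool rather than boxing them in a dict):

class ToyBytesRefHash:
  def __init__(self):
    self._ords = {}
  def add(self, b):
    # new byte strings get the next ordinal; ordinals only ever increase
    if b not in self._ords:
      self._ords[b] = len(self._ords)
    return self._ords[b]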

View File

@@ -409,7 +409,7 @@ public final class FieldCacheSanityChecker {
* it's typically an indication of a possible problem.
* </p>
* <p>
- * <bPNOTE:</b> Only the reader, fieldname, and cached value are actually
+ * <b>NOTE:</b> Only the reader, fieldname, and cached value are actually
* tested -- if two cache entries have different parsers or datatypes but
* the cached values are the same Object (== not just equal()) this method
* does not consider that a red flag. This allows for subtle variations

View File

@@ -253,7 +253,7 @@ public final class PagedBytes {
}
}
- /** 1<<blockBits must be bigger than biggest single
+ /** 1&lt;&lt;blockBits must be bigger than biggest single
* BytesRef slice that will be pulled */
public PagedBytes(int blockBits) {
this.blockSize = 1 << blockBits;
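
Concretely, with an illustrative blockBits of 15:

blockBits = 15           # hypothetical constructor argument
print 1 << blockBits     # 32768 bytes per block: the largest storable slice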

View File

@@ -158,7 +158,7 @@ public abstract class FacetRequest implements Cloneable {
}
/**
- * If getNumLabel()<getNumResults(), only the first getNumLabel() results
+ * If getNumLabel() &lt; getNumResults(), only the first getNumLabel() results
* will have their category paths calculated, and the rest will only be
* available as ordinals (category numbers) and will have null paths.
* <P>
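
Illustratively (hypothetical values for the contract above): with getNumLabel() == 2 and getNumResults() == 10, only the first two of the ten returned categories carry resolved category paths; the remaining eight are returned as ordinals with null paths.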

View File

@@ -1,17 +1,5 @@
package org.apache.lucene.facet.taxonomy.writercache.cl2o;
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -29,6 +17,20 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
* limitations under the License.
*/
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
// TODO: maybe this could use an FST instead...
/**
* This is a very efficient LabelToOrdinal implementation that uses a
* CharBlockArray to store all labels and a configurable number of HashArrays to
@@ -47,7 +49,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
*
* <p>
* This data structure has a much lower memory footprint (~30%) compared to a
- * Java HashMap<String, Integer>. It also only uses a small fraction of objects
+ * Java HashMap&lt;String, Integer&gt;. It also only uses a small fraction of objects
* a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
* ~50% faster compared to a HashMap for 3M unique labels.
*