mirror of https://github.com/apache/lucene.git
add simple python script to find broken javadocs links; fix some HTML escaping in javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1328949 13f79535-47bb-0310-9956-ffa450edef68
parent efa3bdf01e
commit 422bed652e
@@ -0,0 +1,156 @@
import traceback
import os
import sys
import re
from HTMLParser import HTMLParser, HTMLParseError
import urlparse

# Note: these two regexps are not currently used; links are extracted with
# the HTMLParser subclass below instead:
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)

# silly emacs: '

class FindHyperlinks(HTMLParser):

  def __init__(self, baseURL):
    HTMLParser.__init__(self)
    self.anchors = set()
    self.links = []
    self.baseURL = baseURL
    self.printed = False

  def handle_starttag(self, tag, attrs):
    if tag == 'a':
      name = None
      href = None
      for attName, attValue in attrs:
        if attName == 'name':
          name = attValue
        elif attName == 'href':
          href = attValue

      if name is not None:
        assert href is None
        if name in self.anchors:
          if name in ('serializedForm',
                      'serialized_methods',
                      'readObject(java.io.ObjectInputStream)',
                      'writeObject(java.io.ObjectOutputStream)') \
             and self.baseURL.endswith('/serialized-form.html'):
            # Seems like a bug in Javadoc generation... you can't have the
            # same anchor name more than once...
            pass
          else:
            self.printFile()
            print '  WARNING: anchor "%s" appears more than once' % name
        else:
          self.anchors.add(name)
      elif href is not None:
        assert name is None
        self.links.append(urlparse.urljoin(self.baseURL, href))
      else:
        if self.baseURL.endswith('/AttributeSource.html'):
          # LUCENE-4010: AttributeSource's javadocs have an unescaped <A>
          # from generics!!  Seems to be a javadocs bug... (fixed in Java 7)
          pass
        else:
          raise RuntimeError('BUG: %s' % attrs)

  def printFile(self):
    if not self.printed:
      print
      print '  ' + self.baseURL
      self.printed = True

def parse(baseURL, html):
  parser = FindHyperlinks(baseURL)
  try:
    parser.feed(html)
    parser.close()
  except HTMLParseError:
    parser.printFile()
    print '  WARNING: failed to parse:'
    traceback.print_exc()
    return [], []

  #print '  %d links, %d anchors' % \
  #      (len(parser.links), len(parser.anchors))
  return parser.links, parser.anchors

def checkAll(dirName):
  """
  Checks *.html (recursively) under this directory.
  """

  # Find/parse all HTML files first
  print
  print 'Crawl/parse...'
  allFiles = {}

  if os.path.isfile(dirName):
    root, fileName = os.path.split(dirName)
    iter = ((root, [], [fileName]),)
  else:
    iter = os.walk(dirName)

  for root, dirs, files in iter:
    for f in files:
      main, ext = os.path.splitext(f)
      ext = ext.lower()

      # maybe?:
      # and main not in ('serialized-form'):
      if ext in ('.htm', '.html') and \
         not f.startswith('.#') and \
         main not in ('deprecated-list',):
        # Somehow, even with Java 7 generated javadocs,
        # deprecated-list.html can fail to escape generic types
        fullPath = os.path.join(root, f)
        #print '  %s' % fullPath
        allFiles[fullPath] = parse(fullPath, open('%s/%s' % (root, f)).read())

  # ... then verify:
  print
  print 'Verify...'
  for fullPath, (links, anchors) in allFiles.items():
    #print fullPath
    printed = False
    for link in links:

      origLink = link

      # TODO: use urlparse?
      idx = link.find('#')
      if idx != -1:
        anchor = link[idx+1:]
        link = link[:idx]
      else:
        anchor = None

      idx = link.find('?')
      if idx != -1:
        link = link[:idx]

      # TODO: normalize path sep for windows...
      if link.startswith('http://') or link.startswith('https://'):
        # don't check external links
        pass
      elif link not in allFiles:
        # We only load HTML... so if the link is to another resource (eg
        # SweetSpotSimilarity refs
        # lucene/build/docs/misc/org/apache/lucene/misc/doc-files/ss.gnuplot) then it's OK:
        if not os.path.exists(link):
          if not printed:
            printed = True
            print
            print fullPath
          print '  BROKEN LINK: %s' % link
      elif anchor is not None and anchor not in allFiles[link][1]:
        if not printed:
          printed = True
          print
          print fullPath
        print '  BROKEN ANCHOR: %s' % origLink

if __name__ == '__main__':
  checkAll(sys.argv[1])
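The script is driven entirely by checkAll(): point it at a generated javadocs tree (or a single HTML file) and it crawls, parses, then cross-checks every link and anchor. A minimal usage sketch, assuming the script is saved as checkJavadocLinks.py (the commit does not show the file path) and run under Python 2:

    # Hypothetical invocation; the script name and docs path are assumptions.
    #   python checkJavadocLinks.py lucene/build/docs
    #
    # Or reuse it as a module from another Python 2 script:
    import checkJavadocLinks
    checkJavadocLinks.checkAll('lucene/build/docs')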
@@ -191,7 +191,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
  * interletter values. In other words, it does something like:
  * </p>
  * <code>
- * for(i=0; i<patterns.length; i++) {
+ * for(i=0; i&lt;patterns.length; i++) {
  * if ( word.substring(index).startsWidth(patterns[i]) )
  * update_interletter_values(patterns[i]);
  * }
@@ -54,7 +54,7 @@ import java.util.ArrayList;
  * merge fewer segments (down to 1 at once, if that one has
  * deletions) to keep the segment size under budget.
  *
- * <p<b>NOTE</b>: this policy freely merges non-adjacent
+ * <p><b>NOTE</b>: this policy freely merges non-adjacent
  * segments; if this is a problem, use {@link
  * LogMergePolicy}.
  *
@@ -103,7 +103,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><font=-1><u>VSM Score</u></font></center>
+ * <center><font size=-1><u>VSM Score</u></font></center>
  * </td></tr>
  * </table>
  * <br> <br>
@@ -194,7 +194,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
+ * <center><font size=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
  * </td></tr>
  * </table>
  * <br> <br>
@@ -291,7 +291,7 @@ import org.apache.lucene.util.SmallFloat;
  * </table>
  * </td></tr>
  * <tr><td>
- * <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
+ * <center><font size=-1><u>Lucene Practical Scoring Function</u></font></center>
  * </td></tr>
  * </table>
  *
@@ -410,7 +410,7 @@ import org.apache.lucene.util.SmallFloat;
  * computes this value as:
  *
  * <br> <br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
  * <tr>
  *   <td valign="middle" align="right" rowspan="1">
  *     {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} =
@@ -476,7 +476,7 @@ import org.apache.lucene.util.SmallFloat;
  * If the document has multiple fields with the same name, all their boosts are multiplied together:
  *
  * <br> <br>
- * <table cellpadding="1" cellspacing="0" border="0"n align="center" style="width:auto">
+ * <table cellpadding="1" cellspacing="0" border="0" align="center" style="width:auto">
  * <tr>
  *   <td valign="middle" align="right" rowspan="1">
  *     norm(t,d) =
@@ -30,7 +30,7 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
 /**
  * {@link BytesRefHash} is a special purpose hash-map like data-structure
  * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
- * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
+ * byte arrays to ordinal (Map&lt;BytesRef,int&gt;) storing the hashed bytes
  * efficiently in continuous storage. The mapping to the ordinal is
  * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
  * for each added {@link BytesRef}.
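An unescaped generic like the Map<BytesRef,int> above is exactly what the new checker trips over: HTMLParser reads the stray <...> as a start tag, and when it happens to be <A> it reaches FindHyperlinks with neither a name nor an href. A small Python 2 sketch using the script's parse() from above; the fragment and path are illustrative, not from a real javadoc page:

    # Hypothetical demo of the checker catching an unescaped generic.
    snippet = '<html><body>public interface Attr<A> extends Base</body></html>'
    try:
      parse('/tmp/Demo.html', snippet)
    except RuntimeError, e:
      # FindHyperlinks raises RuntimeError('BUG: ...') because <A> parses
      # as an anchor tag with no name/href attribute.
      print 'caught unescaped generic:', e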
@@ -409,7 +409,7 @@ public final class FieldCacheSanityChecker {
  * it's typically an indication of a possible problem.
  * </p>
  * <p>
- * <bPNOTE:</b> Only the reader, fieldname, and cached value are actually
+ * <b>NOTE:</b> Only the reader, fieldname, and cached value are actually
  * tested -- if two cache entries have different parsers or datatypes but
  * the cached values are the same Object (== not just equal()) this method
  * does not consider that a red flag. This allows for subtle variations
@@ -253,7 +253,7 @@ public final class PagedBytes {
     }
   }
 
-  /** 1<<blockBits must be bigger than biggest single
+  /** 1&lt;&lt;blockBits must be bigger than biggest single
    *  BytesRef slice that will be pulled */
   public PagedBytes(int blockBits) {
     this.blockSize = 1 << blockBits;
@@ -158,7 +158,7 @@ public abstract class FacetRequest implements Cloneable {
   }
 
   /**
-   * If getNumLabel()<getNumResults(), only the first getNumLabel() results
+   * If getNumLabel() &lt; getNumResults(), only the first getNumLabel() results
    * will have their category paths calculated, and the rest will only be
    * available as ordinals (category numbers) and will have null paths.
    * <P>
@@ -1,17 +1,5 @@
 package org.apache.lucene.facet.taxonomy.writercache.cl2o;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
@@ -29,6 +17,20 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
  * limitations under the License.
  */
 
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+// TODO: maybe this could use an FST instead...
+
 /**
  * This is a very efficient LabelToOrdinal implementation that uses a
  * CharBlockArray to store all labels and a configurable number of HashArrays to
@@ -47,7 +49,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
  *
  * <p>
  * This data structure has a much lower memory footprint (~30%) compared to a
- * Java HashMap<String, Integer>. It also only uses a small fraction of objects
+ * Java HashMap&lt;String, Integer&gt;. It also only uses a small fraction of objects
  * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
  * ~50% faster compared to a HashMap for 3M unique labels.
  *
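For reference, a quick sketch of what the script's parse() hands back on a well-formed page, which is the raw material checkAll() verifies; the names and paths below are illustrative only:

    # Hypothetical Python 2 demo of parse() output.
    page = ('<html><body>'
            '<a name="ctor"></a>'
            '<a href="OtherClass.html#method">see</a>'
            '</body></html>')
    links, anchors = parse('/docs/MyClass.html', page)
    # links   == ['/docs/OtherClass.html#method']  (href resolved against baseURL)
    # anchors == set(['ctor'])
    # checkAll() then checks that each link target exists and, if the link
    # carries a #fragment, that the target page declares a matching anchor.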