mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 21:39:25 +00:00
LUCENE-4220: Remove the buggy JavaCC-based HTML parser in the benchmark module and replaced by NekoHTML
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361741 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3377e98fdb
commit
67b1fdfc5d
@ -102,6 +102,7 @@
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>
|
||||
|
@ -29,6 +29,10 @@ API Changes
|
||||
make a custom FieldType and set indexed = true, it's analyzed by the analyzer.
|
||||
(Robert Muir)
|
||||
|
||||
* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
|
||||
module and replaced it with NekoHTML. The HTMLParser interface was cleaned up while
|
||||
changing method signatures. (Uwe Schindler, Robert Muir)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4171: Performance improvements to Packed64.
|
||||
|
@ -155,6 +155,7 @@
|
||||
<fileset dir="lib">
|
||||
<include name="commons-compress-1.2.jar"/>
|
||||
<include name="xercesImpl-2.9.1.jar"/>
|
||||
<include name="nekohtml-1.9.15.jar"/>
|
||||
</fileset>
|
||||
</path>
|
||||
<path id="run.classpath">
|
||||
@ -261,20 +262,6 @@
|
||||
|
||||
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
|
||||
|
||||
<target name="clean-javacc">
|
||||
<delete>
|
||||
<fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
|
||||
<containsregexp expression="Generated.*By.*JavaCC"/>
|
||||
</fileset>
|
||||
</delete>
|
||||
</target>
|
||||
|
||||
<target name="javacc" depends="init,javacc-check" if="javacc.present">
|
||||
<invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
|
||||
outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
|
||||
/>
|
||||
</target>
|
||||
|
||||
<target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
|
||||
<target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">
|
||||
<copy todir="${build.dir}/classes/test/conf">
|
||||
|
@ -21,6 +21,7 @@
|
||||
<dependencies>
|
||||
<dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
|
||||
<dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
|
||||
<dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
1
lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1
Normal file
1
lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1
Normal file
@ -0,0 +1 @@
|
||||
a45cd7b7401d9c2264d4908182380452c03ebf8f
|
@ -19,51 +19,203 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import org.cyberneko.html.parsers.SAXParser;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
/**
|
||||
* HTML Parser that is based on Lucene's demo HTML parser.
|
||||
* Simple HTML Parser extracting title, meta tags, and body text
|
||||
* that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
|
||||
*/
|
||||
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
|
||||
|
||||
public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
|
||||
org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader);
|
||||
public class DemoHTMLParser implements HTMLParser {
|
||||
|
||||
/** The actual parser to read HTML documents */
|
||||
public static final class Parser {
|
||||
|
||||
// title
|
||||
if (title==null) {
|
||||
title = p.getTitle();
|
||||
public final Properties metaTags = new Properties();
|
||||
public final String title, body;
|
||||
|
||||
public Parser(Reader reader) throws IOException, SAXException {
|
||||
this(new InputSource(reader));
|
||||
}
|
||||
|
||||
public Parser(InputSource source) throws IOException, SAXException {
|
||||
final SAXParser parser = new SAXParser();
|
||||
parser.setFeature("http://xml.org/sax/features/namespaces", true);
|
||||
parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
|
||||
parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
|
||||
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
|
||||
parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
|
||||
|
||||
final StringBuilder title = new StringBuilder(), body = new StringBuilder();
|
||||
final DefaultHandler handler = new DefaultHandler() {
|
||||
private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
|
||||
|
||||
@Override
|
||||
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
|
||||
if (inHEAD > 0) {
|
||||
if (equalsIgnoreTurkish("title", localName)) {
|
||||
inTITLE++;
|
||||
} else {
|
||||
if (equalsIgnoreTurkish("meta", localName)) {
|
||||
String name = atts.getValue("name");
|
||||
if (name == null) {
|
||||
name = atts.getValue("http-equiv");
|
||||
}
|
||||
final String val = atts.getValue("content");
|
||||
if (name != null && val != null) {
|
||||
metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (inBODY > 0) {
|
||||
if (SUPPRESS_ELEMENTS.contains(localName)) {
|
||||
suppressed++;
|
||||
} else if (equalsIgnoreTurkish("img", localName)) {
|
||||
// the original javacc-based parser preserved <IMG alt="..."/>
|
||||
// attribute as body text in [] parenthesis:
|
||||
final String alt = atts.getValue("alt");
|
||||
if (alt != null) {
|
||||
body.append('[').append(alt).append(']');
|
||||
}
|
||||
}
|
||||
} else if (equalsIgnoreTurkish("body", localName)) {
|
||||
inBODY++;
|
||||
} else if (equalsIgnoreTurkish("head", localName)) {
|
||||
inHEAD++;
|
||||
} else if (equalsIgnoreTurkish("frameset", localName)) {
|
||||
throw new SAXException("This parser does not support HTML framesets.");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
|
||||
if (inBODY > 0) {
|
||||
if (equalsIgnoreTurkish("body", localName)) {
|
||||
inBODY--;
|
||||
} else if (ENDLINE_ELEMENTS.contains(localName)) {
|
||||
body.append('\n');
|
||||
} else if (SUPPRESS_ELEMENTS.contains(localName)) {
|
||||
suppressed--;
|
||||
}
|
||||
} else if (inHEAD > 0) {
|
||||
if (equalsIgnoreTurkish("head", localName)) {
|
||||
inHEAD--;
|
||||
} else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
|
||||
inTITLE--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||
if (inBODY > 0 && suppressed == 0) {
|
||||
body.append(ch, start, length);
|
||||
} else if (inTITLE > 0) {
|
||||
title.append(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputSource resolveEntity(String publicId, String systemId) {
|
||||
// disable network access caused by DTDs
|
||||
return new InputSource(new StringReader(""));
|
||||
}
|
||||
};
|
||||
|
||||
parser.setContentHandler(handler);
|
||||
parser.setErrorHandler(handler);
|
||||
parser.parse(source);
|
||||
|
||||
// the javacc-based parser trimmed title (which should be done for HTML in all cases):
|
||||
this.title = title.toString().trim();
|
||||
|
||||
// assign body text
|
||||
this.body = body.toString();
|
||||
}
|
||||
|
||||
// TODO: remove the Turkish workaround once this is fixed in NekoHTML:
|
||||
// https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
|
||||
|
||||
// BEGIN: workaround
|
||||
static final String convertTurkish(String s) {
|
||||
return s.replace('i', 'ı');
|
||||
}
|
||||
|
||||
static final boolean equalsIgnoreTurkish(String s1, String s2) {
|
||||
final int len1 = s1.length(), len2 = s2.length();
|
||||
if (len1 != len2)
|
||||
return false;
|
||||
for (int i = 0; i < len1; i++) {
|
||||
char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
|
||||
if (ch1 == 'ı') ch1 = 'i';
|
||||
if (ch2 == 'ı') ch2 = 'i';
|
||||
if (ch1 != ch2)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// END: workaround
|
||||
|
||||
static final Set<String> createElementNameSet(String... names) {
|
||||
final HashSet<String> set = new HashSet<String>();
|
||||
for (final String name : names) {
|
||||
set.add(name);
|
||||
set.add(convertTurkish(name));
|
||||
}
|
||||
return Collections.unmodifiableSet(set);
|
||||
}
|
||||
|
||||
/** HTML elements that cause a line break (they are block-elements) */
|
||||
static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
|
||||
"p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
|
||||
"pre", "hr", "blockquote", "address", "fieldset", "table", "form",
|
||||
"noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
|
||||
);
|
||||
|
||||
/** HTML elements with contents that are ignored */
|
||||
static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
|
||||
"style", "script"
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
|
||||
try {
|
||||
return parse(docData, name, date, new InputSource(reader), trecSrc);
|
||||
} catch (SAXException saxe) {
|
||||
throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
|
||||
}
|
||||
}
|
||||
|
||||
public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
|
||||
final Parser p = new Parser(source);
|
||||
|
||||
// properties
|
||||
Properties props = p.getMetaTags();
|
||||
// body
|
||||
Reader r = p.getReader();
|
||||
char c[] = new char[1024];
|
||||
StringBuilder bodyBuf = new StringBuilder();
|
||||
int n;
|
||||
while ((n = r.read(c)) >= 0) {
|
||||
if (n>0) {
|
||||
bodyBuf.append(c,0,n);
|
||||
}
|
||||
}
|
||||
r.close();
|
||||
if (date == null && props.getProperty("date")!=null) {
|
||||
try {
|
||||
date = dateFormat.parse(props.getProperty("date").trim());
|
||||
} catch (ParseException e) {
|
||||
// do not fail test just because a date could not be parsed
|
||||
System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
|
||||
date = new Date(); // now
|
||||
final Properties props = p.metaTags;
|
||||
String dateStr = props.getProperty("date");
|
||||
if (dateStr != null) {
|
||||
final Date newDate = trecSrc.parseDate(dateStr);
|
||||
if (newDate != null) {
|
||||
date = newDate;
|
||||
}
|
||||
}
|
||||
|
||||
docData.clear();
|
||||
docData.setName(name);
|
||||
docData.setBody(bodyBuf.toString());
|
||||
docData.setTitle(title);
|
||||
docData.setBody(p.body);
|
||||
docData.setTitle(p.title);
|
||||
docData.setProps(props);
|
||||
docData.setDate(date);
|
||||
return docData;
|
||||
|
@ -19,7 +19,6 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
@ -34,13 +33,11 @@ public interface HTMLParser {
|
||||
* @param docData result reused
|
||||
* @param name name of the result doc data.
|
||||
* @param date date of the result doc data. If null, attempt to set by parsed data.
|
||||
* @param title title of the result doc data. If null, attempt to set by parsed data.
|
||||
* @param reader reader of html text to parse.
|
||||
* @param dateFormat date formatter to use for extracting the date.
|
||||
* @param trecSrc the {@link TrecContentSource} used to parse dates.
|
||||
* @return Parsed doc data.
|
||||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
|
||||
public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
|
||||
|
||||
}
|
||||
|
@ -22,7 +22,6 @@ import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
|
||||
@ -33,8 +32,6 @@ import java.util.Locale;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
|
||||
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
|
||||
import org.apache.lucene.util.ThreadInterruptedException;
|
||||
|
||||
/**
|
||||
* Implements a {@link ContentSource} over the TREC collection.
|
||||
@ -57,7 +54,7 @@ import org.apache.lucene.util.ThreadInterruptedException;
|
||||
*/
|
||||
public class TrecContentSource extends ContentSource {
|
||||
|
||||
private static final class DateFormatInfo {
|
||||
static final class DateFormatInfo {
|
||||
DateFormat[] dfs;
|
||||
ParsePosition pos;
|
||||
}
|
||||
@ -83,13 +80,10 @@ public class TrecContentSource extends ContentSource {
|
||||
};
|
||||
|
||||
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
|
||||
private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
|
||||
private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
|
||||
private File dataDir = null;
|
||||
private ArrayList<File> inputFiles = new ArrayList<File>();
|
||||
private int nextFile = 0;
|
||||
private int rawDocSize = 0;
|
||||
|
||||
// Use to synchronize threads on reading from the TREC documents.
|
||||
private Object lock = new Object();
|
||||
|
||||
@ -126,17 +120,6 @@ public class TrecContentSource extends ContentSource {
|
||||
return sb;
|
||||
}
|
||||
|
||||
Reader getTrecDocReader(StringBuilder docBuffer) {
|
||||
StringBuilderReader r = trecDocReader.get();
|
||||
if (r == null) {
|
||||
r = new StringBuilderReader(docBuffer);
|
||||
trecDocReader.set(r);
|
||||
} else {
|
||||
r.set(docBuffer);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
HTMLParser getHtmlParser() {
|
||||
return htmlParser;
|
||||
}
|
||||
@ -161,7 +144,7 @@ public class TrecContentSource extends ContentSource {
|
||||
continue;
|
||||
}
|
||||
|
||||
rawDocSize += line.length();
|
||||
line.length();
|
||||
|
||||
if (lineStart!=null && line.startsWith(lineStart)) {
|
||||
if (collectMatchLine) {
|
||||
@ -287,12 +270,8 @@ public class TrecContentSource extends ContentSource {
|
||||
|
||||
// This code segment relies on HtmlParser being thread safe. When we get
|
||||
// here, everything else is already private to that thread, so we're safe.
|
||||
try {
|
||||
docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
|
||||
addItem();
|
||||
} catch (InterruptedException ie) {
|
||||
throw new ThreadInterruptedException(ie);
|
||||
}
|
||||
docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
|
||||
addItem();
|
||||
|
||||
return docData;
|
||||
}
|
||||
|
@ -80,7 +80,7 @@ public abstract class TrecDocParser {
|
||||
* parsers to alter their behavior according to the file path type.
|
||||
*/
|
||||
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException;
|
||||
|
||||
/**
|
||||
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
|
||||
|
@ -37,7 +37,7 @@ public class TrecFBISParser extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException {
|
||||
int mark = 0; // that much is skipped
|
||||
// optionally skip some of the text, set date, title
|
||||
Date date = null;
|
||||
|
@ -41,7 +41,7 @@ public class TrecFR94Parser extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException {
|
||||
int mark = 0; // that much is skipped
|
||||
// optionally skip some of the text, set date (no title?)
|
||||
Date date = null;
|
||||
|
@ -33,7 +33,7 @@ public class TrecFTParser extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException {
|
||||
int mark = 0; // that much is skipped
|
||||
|
||||
// date...
|
||||
|
@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
@ -31,29 +31,24 @@ public class TrecGov2Parser extends TrecDocParser {
|
||||
|
||||
private static final String DOCHDR = "<DOCHDR>";
|
||||
private static final String TERMINATING_DOCHDR = "</DOCHDR>";
|
||||
private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
// Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
|
||||
Reader r = trecSrc.getTrecDocReader(docBuf);
|
||||
|
||||
// skip some of the text, optionally set date
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException {
|
||||
// skip some of the non-html text, optionally set date
|
||||
Date date = null;
|
||||
int h1 = docBuf.indexOf(DOCHDR);
|
||||
if (h1>=0) {
|
||||
int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
|
||||
String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
|
||||
int start = 0;
|
||||
final int h1 = docBuf.indexOf(DOCHDR);
|
||||
if (h1 >= 0) {
|
||||
final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
|
||||
final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
|
||||
if (dateStr != null) {
|
||||
date = trecSrc.parseDate(dateStr);
|
||||
}
|
||||
r.mark(h2+TERMINATING_DOCHDR_LENGTH);
|
||||
start = h2 + TERMINATING_DOCHDR.length();
|
||||
}
|
||||
|
||||
r.reset();
|
||||
HTMLParser htmlParser = trecSrc.getHtmlParser();
|
||||
return htmlParser.parse(docData, name, date, null, r, null);
|
||||
final String html = docBuf.substring(start);
|
||||
return trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ public class TrecLATimesParser extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException {
|
||||
int mark = 0; // that much is skipped
|
||||
|
||||
// date...
|
||||
|
@ -26,7 +26,7 @@ public class TrecParserByPath extends TrecDocParser {
|
||||
|
||||
@Override
|
||||
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
|
||||
StringBuilder docBuf, ParsePathType pathType) throws IOException {
|
||||
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
|
||||
}
|
||||
|
||||
|
@ -1,112 +0,0 @@
|
||||
/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */
|
||||
/* JavaCCOptions:STATIC=false */
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/**
|
||||
* This interface describes a character stream that maintains line and
|
||||
* column number positions of the characters. It also has the capability
|
||||
* to backup the stream to some extent. An implementation of this
|
||||
* interface is used in the TokenManager implementation generated by
|
||||
* JavaCCParser.
|
||||
*
|
||||
* All the methods except backup can be implemented in any fashion. backup
|
||||
* needs to be implemented correctly for the correct operation of the lexer.
|
||||
* Rest of the methods are all used to get information like line number,
|
||||
* column number and the String that constitutes a token and are not used
|
||||
* by the lexer. Hence their implementation won't affect the generated lexer's
|
||||
* operation.
|
||||
*/
|
||||
|
||||
public interface CharStream {
|
||||
|
||||
/**
|
||||
* Returns the next character from the selected input. The method
|
||||
* of selecting the input is the responsibility of the class
|
||||
* implementing this interface. Can throw any java.io.IOException.
|
||||
*/
|
||||
char readChar() throws java.io.IOException;
|
||||
|
||||
/**
|
||||
* Returns the column position of the character last read.
|
||||
* @deprecated
|
||||
* @see #getEndColumn
|
||||
*/
|
||||
int getColumn();
|
||||
|
||||
/**
|
||||
* Returns the line number of the character last read.
|
||||
* @deprecated
|
||||
* @see #getEndLine
|
||||
*/
|
||||
int getLine();
|
||||
|
||||
/**
|
||||
* Returns the column number of the last character for current token (being
|
||||
* matched after the last call to BeginToken).
|
||||
*/
|
||||
int getEndColumn();
|
||||
|
||||
/**
|
||||
* Returns the line number of the last character for current token (being
|
||||
* matched after the last call to BeginToken).
|
||||
*/
|
||||
int getEndLine();
|
||||
|
||||
/**
|
||||
* Returns the column number of the first character for current token (being
|
||||
* matched after the last call to BeginToken).
|
||||
*/
|
||||
int getBeginColumn();
|
||||
|
||||
/**
|
||||
* Returns the line number of the first character for current token (being
|
||||
* matched after the last call to BeginToken).
|
||||
*/
|
||||
int getBeginLine();
|
||||
|
||||
/**
|
||||
* Backs up the input stream by amount steps. Lexer calls this method if it
|
||||
* had already read some characters, but could not use them to match a
|
||||
* (longer) token. So, they will be used again as the prefix of the next
|
||||
* token and it is the implementation's responsibility to do this right.
|
||||
*/
|
||||
void backup(int amount);
|
||||
|
||||
/**
|
||||
* Returns the next character that marks the beginning of the next token.
|
||||
* All characters must remain in the buffer between two successive calls
|
||||
* to this method to implement backup correctly.
|
||||
*/
|
||||
char BeginToken() throws java.io.IOException;
|
||||
|
||||
/**
|
||||
* Returns a string made up of characters from the marked token beginning
|
||||
* to the current buffer position. Implementations have the choice of returning
|
||||
* anything that they want to. For example, for efficiency, one might decide
|
||||
* to just return null, which is a valid implementation.
|
||||
*/
|
||||
String GetImage();
|
||||
|
||||
/**
|
||||
* Returns an array of characters that make up the suffix of length 'len' for
|
||||
* the currently matched token. This is used to build up the matched string
|
||||
* for use in actions in the case of MORE. A simple and inefficient
|
||||
* implementation of this is as follows :
|
||||
*
|
||||
* {
|
||||
* String t = GetImage();
|
||||
* return t.substring(t.length() - len, t.length()).toCharArray();
|
||||
* }
|
||||
*/
|
||||
char[] GetSuffix(int len);
|
||||
|
||||
/**
|
||||
* The lexer calls this function to indicate that it is done with the stream
|
||||
* and hence implementations can free any resources held by this class.
|
||||
* Again, the body of this function can be just empty and it will not
|
||||
* affect the lexer's operation.
|
||||
*/
|
||||
void Done();
|
||||
|
||||
}
|
||||
/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */
|
@ -1,330 +0,0 @@
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Utility class for encoding and decoding HTML entities.
 *
 * <p>Entity names are registered with their leading {@code '&'} and without
 * the trailing {@code ';'} (for example {@code "&amp"}). The {@link #decoder}
 * map holds every registered name; the {@link #encoder} table only holds the
 * names whose character value is below {@code 0x100}.
 */
public class Entities {
  /** Maps an entity name (leading '&amp;', no trailing ';') to its replacement text. */
  static final Map<String,String> decoder = new HashMap<String,String>(300);

  /** Maps a character value below 0x100 to its entity name, or null if it has none. */
  static final String[] encoder = new String[0x100];

  /**
   * Decodes a single entity reference.
   *
   * @param entity the reference including the leading {@code '&'}; a trailing
   *        {@code ';'} is optional. Numeric references may be decimal
   *        ({@code &#65;}) or hexadecimal ({@code &#x41;}).
   * @return the decoded text, or the empty string when the entity is unknown,
   *         too short, or not a valid numeric reference
   */
  static final String decode(String entity) {
    int end = entity.length();
    if (end > 0 && entity.charAt(end - 1) == ';') { // remove trailing semicolon
      end--;
    }
    final String name = entity.substring(0, end);
    if (name.length() < 2) {
      // nothing follows the '&': treat like an unknown entity
      return "";
    }

    if (name.charAt(1) != '#') {
      final String s = decoder.get(name);
      return (s != null) ? s : "";
    }

    // numeric character reference
    int start = 2;
    int radix = 10;
    if (name.length() > 2 && (name.charAt(2) == 'X' || name.charAt(2) == 'x')) {
      start++;
      radix = 16;
    }
    try {
      final int codePoint = Integer.parseInt(name.substring(start), radix);
      if (!Character.isValidCodePoint(codePoint)) {
        return "";
      }
      // toChars yields a surrogate pair for supplementary code points
      // instead of truncating the value to 16 bits
      return new String(Character.toChars(codePoint));
    } catch (NumberFormatException e) {
      // missing or non-numeric digits: same result as an unknown entity
      return "";
    }
  }

  /**
   * Encodes a string for use in HTML.
   *
   * <p>Characters that have a registered name below {@code 0x100} are written
   * as named references, remaining ASCII characters are copied unchanged, and
   * everything else is written as a decimal numeric reference. The input is
   * processed by code point, so a surrogate pair produces one reference.
   *
   * @param s the text to encode
   * @return the encoded text
   */
  public static final String encode(String s) {
    final int length = s.length();
    final StringBuilder buffer = new StringBuilder(length * 2);
    for (int i = 0; i < length; ) {
      final int j = s.codePointAt(i);
      i += Character.charCount(j);
      if (j < 0x100 && encoder[j] != null) {
        buffer.append(encoder[j]);                // have a named encoding
        buffer.append(';');
      } else if (j < 0x80) {
        buffer.append((char) j);                  // use ASCII value
      } else {
        buffer.append("&#");                      // use numeric encoding
        buffer.append(j).append(';');
      }
    }
    return buffer.toString();
  }

  /**
   * Registers an entity name for a character value.
   *
   * @param entity the entity name with leading {@code '&'} and no trailing {@code ';'}
   * @param value the character value the entity stands for
   */
  static final void add(String entity, int value) {
    decoder.put(entity, String.valueOf((char) value));
    if (value < 0x100) {
      encoder[value] = entity;
    }
  }

  static {
    add("&nbsp", 160);
    add("&iexcl", 161);
    add("&cent", 162);
    add("&pound", 163);
    add("&curren", 164);
    add("&yen", 165);
    add("&brvbar", 166);
    add("&sect", 167);
    add("&uml", 168);
    add("&copy", 169);
    add("&ordf", 170);
    add("&laquo", 171);
    add("&not", 172);
    add("&shy", 173);
    add("&reg", 174);
    add("&macr", 175);
    add("&deg", 176);
    add("&plusmn", 177);
    add("&sup2", 178);
    add("&sup3", 179);
    add("&acute", 180);
    add("&micro", 181);
    add("&para", 182);
    add("&middot", 183);
    add("&cedil", 184);
    add("&sup1", 185);
    add("&ordm", 186);
    add("&raquo", 187);
    add("&frac14", 188);
    add("&frac12", 189);
    add("&frac34", 190);
    add("&iquest", 191);
    add("&Agrave", 192);
    add("&Aacute", 193);
    add("&Acirc", 194);
    add("&Atilde", 195);
    add("&Auml", 196);
    add("&Aring", 197);
    add("&AElig", 198);
    add("&Ccedil", 199);
    add("&Egrave", 200);
    add("&Eacute", 201);
    add("&Ecirc", 202);
    add("&Euml", 203);
    add("&Igrave", 204);
    add("&Iacute", 205);
    add("&Icirc", 206);
    add("&Iuml", 207);
    add("&ETH", 208);
    add("&Ntilde", 209);
    add("&Ograve", 210);
    add("&Oacute", 211);
    add("&Ocirc", 212);
    add("&Otilde", 213);
    add("&Ouml", 214);
    add("&times", 215);
    add("&Oslash", 216);
    add("&Ugrave", 217);
    add("&Uacute", 218);
    add("&Ucirc", 219);
    add("&Uuml", 220);
    add("&Yacute", 221);
    add("&THORN", 222);
    add("&szlig", 223);
    add("&agrave", 224);
    add("&aacute", 225);
    add("&acirc", 226);
    add("&atilde", 227);
    add("&auml", 228);
    add("&aring", 229);
    add("&aelig", 230);
    add("&ccedil", 231);
    add("&egrave", 232);
    add("&eacute", 233);
    add("&ecirc", 234);
    add("&euml", 235);
    add("&igrave", 236);
    add("&iacute", 237);
    add("&icirc", 238);
    add("&iuml", 239);
    add("&eth", 240);
    add("&ntilde", 241);
    add("&ograve", 242);
    add("&oacute", 243);
    add("&ocirc", 244);
    add("&otilde", 245);
    add("&ouml", 246);
    add("&divide", 247);
    add("&oslash", 248);
    add("&ugrave", 249);
    add("&uacute", 250);
    add("&ucirc", 251);
    add("&uuml", 252);
    add("&yacute", 253);
    add("&thorn", 254);
    add("&yuml", 255);
    add("&fnof", 402);
    add("&Alpha", 913);
    add("&Beta", 914);
    add("&Gamma", 915);
    add("&Delta", 916);
    add("&Epsilon", 917);
    add("&Zeta", 918);
    add("&Eta", 919);
    add("&Theta", 920);
    add("&Iota", 921);
    add("&Kappa", 922);
    add("&Lambda", 923);
    add("&Mu", 924);
    add("&Nu", 925);
    add("&Xi", 926);
    add("&Omicron", 927);
    add("&Pi", 928);
    add("&Rho", 929);
    add("&Sigma", 931);
    add("&Tau", 932);
    add("&Upsilon", 933);
    add("&Phi", 934);
    add("&Chi", 935);
    add("&Psi", 936);
    add("&Omega", 937);
    add("&alpha", 945);
    add("&beta", 946);
    add("&gamma", 947);
    add("&delta", 948);
    add("&epsilon", 949);
    add("&zeta", 950);
    add("&eta", 951);
    add("&theta", 952);
    add("&iota", 953);
    add("&kappa", 954);
    add("&lambda", 955);
    add("&mu", 956);
    add("&nu", 957);
    add("&xi", 958);
    add("&omicron", 959);
    add("&pi", 960);
    add("&rho", 961);
    add("&sigmaf", 962);
    add("&sigma", 963);
    add("&tau", 964);
    add("&upsilon", 965);
    add("&phi", 966);
    add("&chi", 967);
    add("&psi", 968);
    add("&omega", 969);
    add("&thetasym", 977);
    add("&upsih", 978);
    add("&piv", 982);
    add("&bull", 8226);
    add("&hellip", 8230);
    add("&prime", 8242);
    add("&Prime", 8243);
    add("&oline", 8254);
    add("&frasl", 8260);
    add("&weierp", 8472);
    add("&image", 8465);
    add("&real", 8476);
    add("&trade", 8482);
    add("&alefsym", 8501);
    add("&larr", 8592);
    add("&uarr", 8593);
    add("&rarr", 8594);
    add("&darr", 8595);
    add("&harr", 8596);
    add("&crarr", 8629);
    add("&lArr", 8656);
    add("&uArr", 8657);
    add("&rArr", 8658);
    add("&dArr", 8659);
    add("&hArr", 8660);
    add("&forall", 8704);
    add("&part", 8706);
    add("&exist", 8707);
    add("&empty", 8709);
    add("&nabla", 8711);
    add("&isin", 8712);
    add("&notin", 8713);
    add("&ni", 8715);
    add("&prod", 8719);
    add("&sum", 8721);
    add("&minus", 8722);
    add("&lowast", 8727);
    add("&radic", 8730);
    add("&prop", 8733);
    add("&infin", 8734);
    add("&ang", 8736);
    add("&and", 8743);
    add("&or", 8744);
    add("&cap", 8745);
    add("&cup", 8746);
    add("&int", 8747);
    add("&there4", 8756);
    add("&sim", 8764);
    add("&cong", 8773);
    add("&asymp", 8776);
    add("&ne", 8800);
    add("&equiv", 8801);
    add("&le", 8804);
    add("&ge", 8805);
    add("&sub", 8834);
    add("&sup", 8835);
    add("&nsub", 8836);
    add("&sube", 8838);
    add("&supe", 8839);
    add("&oplus", 8853);
    add("&otimes", 8855);
    add("&perp", 8869);
    add("&sdot", 8901);
    add("&lceil", 8968);
    add("&rceil", 8969);
    add("&lfloor", 8970);
    add("&rfloor", 8971);
    add("&lang", 9001);
    add("&rang", 9002);
    add("&loz", 9674);
    add("&spades", 9824);
    add("&clubs", 9827);
    add("&hearts", 9829);
    add("&diams", 9830);
    add("&quot", 34);
    add("&amp", 38);
    add("&lt", 60);
    add("&gt", 62);
    add("&OElig", 338);
    add("&oelig", 339);
    add("&Scaron", 352);
    add("&scaron", 353);
    add("&Yuml", 376);
    add("&circ", 710);
    add("&tilde", 732);
    add("&ensp", 8194);
    add("&emsp", 8195);
    add("&thinsp", 8201);
    add("&zwnj", 8204);
    add("&zwj", 8205);
    add("&lrm", 8206);
    add("&rlm", 8207);
    add("&ndash", 8211);
    add("&mdash", 8212);
    add("&lsquo", 8216);
    add("&rsquo", 8217);
    add("&sbquo", 8218);
    add("&ldquo", 8220);
    add("&rdquo", 8221);
    add("&bdquo", 8222);
    add("&dagger", 8224);
    add("&Dagger", 8225);
    add("&permil", 8240);
    add("&lsaquo", 8249);
    add("&rsaquo", 8250);
    add("&euro", 8364);
  }
}
|
@ -1,123 +0,0 @@
|
||||
// FastCharStream.java
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
|
||||
* this does not do line-number counting, but instead keeps track of the
|
||||
* character position of the token in the input, as required by Lucene's {@link
|
||||
* org.apache.lucene.analysis.Token} API.
|
||||
* */
|
||||
public final class FastCharStream implements CharStream {
|
||||
char[] buffer = null;
|
||||
|
||||
int bufferLength = 0; // end of valid chars
|
||||
int bufferPosition = 0; // next char to read
|
||||
|
||||
int tokenStart = 0; // offset in buffer
|
||||
int bufferStart = 0; // position in file of buffer
|
||||
|
||||
Reader input; // source of chars
|
||||
|
||||
/** Constructs from a Reader. */
|
||||
public FastCharStream(Reader r) {
|
||||
input = r;
|
||||
}
|
||||
|
||||
public final char readChar() throws IOException {
|
||||
if (bufferPosition >= bufferLength)
|
||||
refill();
|
||||
return buffer[bufferPosition++];
|
||||
}
|
||||
|
||||
private final void refill() throws IOException {
|
||||
int newPosition = bufferLength - tokenStart;
|
||||
|
||||
if (tokenStart == 0) { // token won't fit in buffer
|
||||
if (buffer == null) { // first time: alloc buffer
|
||||
buffer = new char[2048];
|
||||
} else if (bufferLength == buffer.length) { // grow buffer
|
||||
char[] newBuffer = new char[buffer.length*2];
|
||||
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
|
||||
buffer = newBuffer;
|
||||
}
|
||||
} else { // shift token to front
|
||||
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
|
||||
}
|
||||
|
||||
bufferLength = newPosition; // update state
|
||||
bufferPosition = newPosition;
|
||||
bufferStart += tokenStart;
|
||||
tokenStart = 0;
|
||||
|
||||
int charsRead = // fill space in buffer
|
||||
input.read(buffer, newPosition, buffer.length-newPosition);
|
||||
if (charsRead == -1)
|
||||
throw new IOException("read past eof");
|
||||
else
|
||||
bufferLength += charsRead;
|
||||
}
|
||||
|
||||
public final char BeginToken() throws IOException {
|
||||
tokenStart = bufferPosition;
|
||||
return readChar();
|
||||
}
|
||||
|
||||
public final void backup(int amount) {
|
||||
bufferPosition -= amount;
|
||||
}
|
||||
|
||||
public final String GetImage() {
|
||||
return new String(buffer, tokenStart, bufferPosition - tokenStart);
|
||||
}
|
||||
|
||||
public final char[] GetSuffix(int len) {
|
||||
char[] value = new char[len];
|
||||
System.arraycopy(buffer, bufferPosition - len, value, 0, len);
|
||||
return value;
|
||||
}
|
||||
|
||||
public final void Done() {
|
||||
try {
|
||||
input.close();
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
|
||||
public final int getColumn() {
|
||||
return bufferStart + bufferPosition;
|
||||
}
|
||||
public final int getLine() {
|
||||
return 1;
|
||||
}
|
||||
public final int getEndColumn() {
|
||||
return bufferStart + bufferPosition;
|
||||
}
|
||||
public final int getEndLine() {
|
||||
return 1;
|
||||
}
|
||||
public final int getBeginColumn() {
|
||||
return bufferStart + tokenStart;
|
||||
}
|
||||
public final int getBeginLine() {
|
||||
return 1;
|
||||
}
|
||||
}
|
@ -1,722 +0,0 @@
|
||||
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Basic html parser (for demo/testing purposes only!)
|
||||
*/
|
||||
public class HTMLParser implements HTMLParserConstants {
|
||||
public static int SUMMARY_LENGTH = 200;
|
||||
|
||||
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
|
||||
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
|
||||
Properties metaTags=new Properties();
|
||||
String currentMetaTag=null;
|
||||
String currentMetaContent=null;
|
||||
int length = 0;
|
||||
boolean titleComplete = false;
|
||||
boolean inTitle = false;
|
||||
boolean inMetaTag = false;
|
||||
boolean inStyle = false;
|
||||
boolean afterTag = false;
|
||||
boolean afterSpace = false;
|
||||
String eol = System.getProperty("line.separator");
|
||||
Reader pipeIn = null;
|
||||
Writer pipeOut;
|
||||
private MyPipedInputStream pipeInStream = null;
|
||||
private PipedOutputStream pipeOutStream = null;
|
||||
|
||||
/** Creates a parser over {@code reader} by wrapping it in a FastCharStream and delegating to the CharStream constructor. */
public HTMLParser(Reader reader) {
|
||||
this(new FastCharStream(reader));
|
||||
}
|
||||
|
||||
private class MyPipedInputStream extends PipedInputStream{
|
||||
|
||||
public MyPipedInputStream(){
|
||||
super();
|
||||
}
|
||||
|
||||
public MyPipedInputStream(PipedOutputStream src) throws IOException{
|
||||
super(src);
|
||||
}
|
||||
|
||||
public boolean full() throws IOException{
|
||||
return this.available() >= PipedInputStream.PIPE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return title.toString().trim();
|
||||
}
|
||||
|
||||
public Properties getMetaTags() throws IOException,
|
||||
InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return metaTags;
|
||||
}
|
||||
|
||||
|
||||
public String getSummary() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
if (summary.length() > SUMMARY_LENGTH)
|
||||
summary.setLength(SUMMARY_LENGTH);
|
||||
|
||||
String sum = summary.toString().trim();
|
||||
String tit = getTitle();
|
||||
if (sum.equals(""))
|
||||
return tit;
|
||||
else
|
||||
return sum;
|
||||
}
|
||||
|
||||
public Reader getReader() throws IOException {
|
||||
if (pipeIn == null) {
|
||||
pipeInStream = new MyPipedInputStream();
|
||||
pipeOutStream = new PipedOutputStream(pipeInStream);
|
||||
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
|
||||
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
|
||||
|
||||
Thread thread = new ParserThread(this);
|
||||
thread.start(); // start parsing
|
||||
}
|
||||
|
||||
return pipeIn;
|
||||
}
|
||||
|
||||
void addToSummary(String text) {
|
||||
if (summary.length() < SUMMARY_LENGTH) {
|
||||
summary.append(text);
|
||||
if (summary.length() >= SUMMARY_LENGTH) {
|
||||
synchronized(this) {
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void addText(String text) throws IOException {
|
||||
if (inStyle)
|
||||
return;
|
||||
if (inTitle)
|
||||
title.append(text);
|
||||
else {
|
||||
addToSummary(text);
|
||||
if (!titleComplete && !(title.length() == 0)) { // finished title
|
||||
synchronized(this) {
|
||||
titleComplete = true; // tell waiting threads
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
length += text.length();
|
||||
pipeOut.write(text);
|
||||
|
||||
afterSpace = false;
|
||||
}
|
||||
|
||||
void addMetaTag() {
|
||||
metaTags.setProperty(currentMetaTag, currentMetaContent);
|
||||
currentMetaTag = null;
|
||||
currentMetaContent = null;
|
||||
return;
|
||||
}
|
||||
|
||||
void addSpace() throws IOException {
|
||||
if (!afterSpace) {
|
||||
if (inTitle)
|
||||
title.append(" ");
|
||||
else
|
||||
addToSummary(" ");
|
||||
|
||||
String space = afterTag ? eol : " ";
|
||||
length += space.length();
|
||||
pipeOut.write(space);
|
||||
afterSpace = true;
|
||||
}
|
||||
}
|
||||
|
||||
final public void HTMLDocument() throws ParseException, IOException {
|
||||
Token t;
|
||||
label_1:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ScriptStart:
|
||||
case TagName:
|
||||
case DeclName:
|
||||
case Comment1:
|
||||
case Comment2:
|
||||
case Word:
|
||||
case Entity:
|
||||
case Space:
|
||||
case Punct:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[0] = jj_gen;
|
||||
break label_1;
|
||||
}
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case TagName:
|
||||
Tag();
|
||||
afterTag = true;
|
||||
break;
|
||||
case DeclName:
|
||||
t = Decl();
|
||||
afterTag = true;
|
||||
break;
|
||||
case Comment1:
|
||||
case Comment2:
|
||||
CommentTag();
|
||||
afterTag = true;
|
||||
break;
|
||||
case ScriptStart:
|
||||
ScriptTag();
|
||||
afterTag = true;
|
||||
break;
|
||||
case Word:
|
||||
t = jj_consume_token(Word);
|
||||
addText(t.image); afterTag = false;
|
||||
break;
|
||||
case Entity:
|
||||
t = jj_consume_token(Entity);
|
||||
addText(Entities.decode(t.image)); afterTag = false;
|
||||
break;
|
||||
case Punct:
|
||||
t = jj_consume_token(Punct);
|
||||
addText(t.image); afterTag = false;
|
||||
break;
|
||||
case Space:
|
||||
jj_consume_token(Space);
|
||||
addSpace(); afterTag = false;
|
||||
break;
|
||||
default:
|
||||
jj_la1[1] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
jj_consume_token(0);
|
||||
}
|
||||
|
||||
final public void Tag() throws ParseException, IOException {
|
||||
Token t1, t2;
|
||||
boolean inImg = false;
|
||||
t1 = jj_consume_token(TagName);
|
||||
String tagName = t1.image.toLowerCase(Locale.ROOT);
|
||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||
addSpace();
|
||||
}
|
||||
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
|
||||
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
|
||||
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
|
||||
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
|
||||
|
||||
label_2:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgName:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[2] = jj_gen;
|
||||
break label_2;
|
||||
}
|
||||
t1 = jj_consume_token(ArgName);
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgEquals:
|
||||
jj_consume_token(ArgEquals);
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgValue:
|
||||
case ArgQuote1:
|
||||
case ArgQuote2:
|
||||
t2 = ArgValue();
|
||||
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
|
||||
addText("[" + t2.image + "]");
|
||||
|
||||
if(inMetaTag &&
|
||||
( t1.image.equalsIgnoreCase("name") ||
|
||||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
|
||||
)
|
||||
&& t2 != null)
|
||||
{
|
||||
currentMetaTag=t2.image.toLowerCase(Locale.ROOT);
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
}
|
||||
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
||||
null)
|
||||
{
|
||||
currentMetaContent=t2.image.toLowerCase(Locale.ROOT);
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
jj_la1[3] = jj_gen;
|
||||
;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
jj_la1[4] = jj_gen;
|
||||
;
|
||||
}
|
||||
}
|
||||
jj_consume_token(TagEnd);
|
||||
}
|
||||
|
||||
final public Token ArgValue() throws ParseException {
|
||||
Token t = null;
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgValue:
|
||||
t = jj_consume_token(ArgValue);
|
||||
{if (true) return t;}
|
||||
break;
|
||||
default:
|
||||
jj_la1[5] = jj_gen;
|
||||
if (jj_2_1(2)) {
|
||||
jj_consume_token(ArgQuote1);
|
||||
jj_consume_token(CloseQuote1);
|
||||
{if (true) return t;}
|
||||
} else {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgQuote1:
|
||||
jj_consume_token(ArgQuote1);
|
||||
t = jj_consume_token(Quote1Text);
|
||||
jj_consume_token(CloseQuote1);
|
||||
{if (true) return t;}
|
||||
break;
|
||||
default:
|
||||
jj_la1[6] = jj_gen;
|
||||
if (jj_2_2(2)) {
|
||||
jj_consume_token(ArgQuote2);
|
||||
jj_consume_token(CloseQuote2);
|
||||
{if (true) return t;}
|
||||
} else {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgQuote2:
|
||||
jj_consume_token(ArgQuote2);
|
||||
t = jj_consume_token(Quote2Text);
|
||||
jj_consume_token(CloseQuote2);
|
||||
{if (true) return t;}
|
||||
break;
|
||||
default:
|
||||
jj_la1[7] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new Error("Missing return statement in function");
|
||||
}
|
||||
|
||||
final public Token Decl() throws ParseException {
|
||||
Token t;
|
||||
t = jj_consume_token(DeclName);
|
||||
label_3:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgName:
|
||||
case ArgEquals:
|
||||
case ArgValue:
|
||||
case ArgQuote1:
|
||||
case ArgQuote2:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[8] = jj_gen;
|
||||
break label_3;
|
||||
}
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ArgName:
|
||||
jj_consume_token(ArgName);
|
||||
break;
|
||||
case ArgValue:
|
||||
case ArgQuote1:
|
||||
case ArgQuote2:
|
||||
ArgValue();
|
||||
break;
|
||||
case ArgEquals:
|
||||
jj_consume_token(ArgEquals);
|
||||
break;
|
||||
default:
|
||||
jj_la1[9] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
jj_consume_token(TagEnd);
|
||||
{if (true) return t;}
|
||||
throw new Error("Missing return statement in function");
|
||||
}
|
||||
|
||||
final public void CommentTag() throws ParseException {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case Comment1:
|
||||
jj_consume_token(Comment1);
|
||||
label_4:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case CommentText1:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[10] = jj_gen;
|
||||
break label_4;
|
||||
}
|
||||
jj_consume_token(CommentText1);
|
||||
}
|
||||
jj_consume_token(CommentEnd1);
|
||||
break;
|
||||
case Comment2:
|
||||
jj_consume_token(Comment2);
|
||||
label_5:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case CommentText2:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[11] = jj_gen;
|
||||
break label_5;
|
||||
}
|
||||
jj_consume_token(CommentText2);
|
||||
}
|
||||
jj_consume_token(CommentEnd2);
|
||||
break;
|
||||
default:
|
||||
jj_la1[12] = jj_gen;
|
||||
jj_consume_token(-1);
|
||||
throw new ParseException();
|
||||
}
|
||||
}
|
||||
|
||||
final public void ScriptTag() throws ParseException {
|
||||
jj_consume_token(ScriptStart);
|
||||
label_6:
|
||||
while (true) {
|
||||
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
|
||||
case ScriptText:
|
||||
;
|
||||
break;
|
||||
default:
|
||||
jj_la1[13] = jj_gen;
|
||||
break label_6;
|
||||
}
|
||||
jj_consume_token(ScriptText);
|
||||
}
|
||||
jj_consume_token(ScriptEnd);
|
||||
}
|
||||
|
||||
private boolean jj_2_1(int xla) {
|
||||
jj_la = xla; jj_lastpos = jj_scanpos = token;
|
||||
try { return !jj_3_1(); }
|
||||
catch(LookaheadSuccess ls) { return true; }
|
||||
finally { jj_save(0, xla); }
|
||||
}
|
||||
|
||||
private boolean jj_2_2(int xla) {
|
||||
jj_la = xla; jj_lastpos = jj_scanpos = token;
|
||||
try { return !jj_3_2(); }
|
||||
catch(LookaheadSuccess ls) { return true; }
|
||||
finally { jj_save(1, xla); }
|
||||
}
|
||||
|
||||
private boolean jj_3_2() {
|
||||
if (jj_scan_token(ArgQuote2)) return true;
|
||||
if (jj_scan_token(CloseQuote2)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean jj_3_1() {
|
||||
if (jj_scan_token(ArgQuote1)) return true;
|
||||
if (jj_scan_token(CloseQuote1)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Generated Token Manager. */
|
||||
public HTMLParserTokenManager token_source;
|
||||
/** Current token. */
|
||||
public Token token;
|
||||
/** Next token. */
|
||||
public Token jj_nt;
|
||||
private int jj_ntk;
|
||||
private Token jj_scanpos, jj_lastpos;
|
||||
private int jj_la;
|
||||
private int jj_gen;
|
||||
final private int[] jj_la1 = new int[14];
|
||||
static private int[] jj_la1_0;
|
||||
static {
|
||||
jj_la1_init_0();
|
||||
}
|
||||
private static void jj_la1_init_0() {
|
||||
jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
|
||||
}
|
||||
final private JJCalls[] jj_2_rtns = new JJCalls[2];
|
||||
private boolean jj_rescan = false;
|
||||
private int jj_gc = 0;
|
||||
|
||||
/** Creates a parser reading from {@code stream} through a new HTMLParserTokenManager. */
public HTMLParser(CharStream stream) {
  token_source = new HTMLParserTokenManager(stream);
  token = new Token();
  jj_ntk = -1;
  jj_gen = 0;
  java.util.Arrays.fill(jj_la1, -1); // jj_la1 has exactly 14 slots
  for (int slot = 0; slot < jj_2_rtns.length; slot++) {
    jj_2_rtns[slot] = new JJCalls();
  }
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(CharStream stream) {
|
||||
token_source.ReInit(stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
/** Creates a parser that pulls its tokens from the supplied token manager. */
public HTMLParser(HTMLParserTokenManager tm) {
  token_source = tm;
  token = new Token();
  jj_ntk = -1;
  jj_gen = 0;
  java.util.Arrays.fill(jj_la1, -1); // jj_la1 has exactly 14 slots
  for (int slot = 0; slot < jj_2_rtns.length; slot++) {
    jj_2_rtns[slot] = new JJCalls();
  }
}
|
||||
|
||||
/** Reinitialise. */
|
||||
public void ReInit(HTMLParserTokenManager tm) {
|
||||
token_source = tm;
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
jj_gen = 0;
|
||||
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
|
||||
}
|
||||
|
||||
private Token jj_consume_token(int kind) throws ParseException {
|
||||
Token oldToken;
|
||||
if ((oldToken = token).next != null) token = token.next;
|
||||
else token = token.next = token_source.getNextToken();
|
||||
jj_ntk = -1;
|
||||
if (token.kind == kind) {
|
||||
jj_gen++;
|
||||
if (++jj_gc > 100) {
|
||||
jj_gc = 0;
|
||||
for (int i = 0; i < jj_2_rtns.length; i++) {
|
||||
JJCalls c = jj_2_rtns[i];
|
||||
while (c != null) {
|
||||
if (c.gen < jj_gen) c.first = null;
|
||||
c = c.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
return token;
|
||||
}
|
||||
token = oldToken;
|
||||
jj_kind = kind;
|
||||
throw generateParseException();
|
||||
}
|
||||
|
||||
/** Error subclass thrown by jj_scan_token() and caught by the jj_2_x / jj_rescan_token lookahead drivers. */
static private final class LookaheadSuccess extends java.lang.Error { }
|
||||
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
|
||||
private boolean jj_scan_token(int kind) {
|
||||
if (jj_scanpos == jj_lastpos) {
|
||||
jj_la--;
|
||||
if (jj_scanpos.next == null) {
|
||||
jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
|
||||
} else {
|
||||
jj_lastpos = jj_scanpos = jj_scanpos.next;
|
||||
}
|
||||
} else {
|
||||
jj_scanpos = jj_scanpos.next;
|
||||
}
|
||||
if (jj_rescan) {
|
||||
int i = 0; Token tok = token;
|
||||
while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
|
||||
if (tok != null) jj_add_error_token(kind, i);
|
||||
}
|
||||
if (jj_scanpos.kind != kind) return true;
|
||||
if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/** Get the next Token. */
|
||||
final public Token getNextToken() {
|
||||
if (token.next != null) token = token.next;
|
||||
else token = token.next = token_source.getNextToken();
|
||||
jj_ntk = -1;
|
||||
jj_gen++;
|
||||
return token;
|
||||
}
|
||||
|
||||
/** Get the specific Token. */
|
||||
final public Token getToken(int index) {
|
||||
Token t = token;
|
||||
for (int i = 0; i < index; i++) {
|
||||
if (t.next != null) t = t.next;
|
||||
else t = t.next = token_source.getNextToken();
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
private int jj_ntk() {
|
||||
if ((jj_nt=token.next) == null)
|
||||
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
|
||||
else
|
||||
return (jj_ntk = jj_nt.kind);
|
||||
}
|
||||
|
||||
private java.util.List jj_expentries = new java.util.ArrayList();
|
||||
private int[] jj_expentry;
|
||||
private int jj_kind = -1;
|
||||
private int[] jj_lasttokens = new int[100];
|
||||
private int jj_endpos;
|
||||
|
||||
private void jj_add_error_token(int kind, int pos) {
|
||||
if (pos >= 100) return;
|
||||
if (pos == jj_endpos + 1) {
|
||||
jj_lasttokens[jj_endpos++] = kind;
|
||||
} else if (jj_endpos != 0) {
|
||||
jj_expentry = new int[jj_endpos];
|
||||
for (int i = 0; i < jj_endpos; i++) {
|
||||
jj_expentry[i] = jj_lasttokens[i];
|
||||
}
|
||||
jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
|
||||
int[] oldentry = (int[])(it.next());
|
||||
if (oldentry.length == jj_expentry.length) {
|
||||
for (int i = 0; i < jj_expentry.length; i++) {
|
||||
if (oldentry[i] != jj_expentry[i]) {
|
||||
continue jj_entries_loop;
|
||||
}
|
||||
}
|
||||
jj_expentries.add(jj_expentry);
|
||||
break jj_entries_loop;
|
||||
}
|
||||
}
|
||||
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate ParseException. */
|
||||
public ParseException generateParseException() {
|
||||
jj_expentries.clear();
|
||||
boolean[] la1tokens = new boolean[31];
|
||||
if (jj_kind >= 0) {
|
||||
la1tokens[jj_kind] = true;
|
||||
jj_kind = -1;
|
||||
}
|
||||
for (int i = 0; i < 14; i++) {
|
||||
if (jj_la1[i] == jj_gen) {
|
||||
for (int j = 0; j < 32; j++) {
|
||||
if ((jj_la1_0[i] & (1<<j)) != 0) {
|
||||
la1tokens[j] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 31; i++) {
|
||||
if (la1tokens[i]) {
|
||||
jj_expentry = new int[1];
|
||||
jj_expentry[0] = i;
|
||||
jj_expentries.add(jj_expentry);
|
||||
}
|
||||
}
|
||||
jj_endpos = 0;
|
||||
jj_rescan_token();
|
||||
jj_add_error_token(0, 0);
|
||||
int[][] exptokseq = new int[jj_expentries.size()][];
|
||||
for (int i = 0; i < jj_expentries.size(); i++) {
|
||||
exptokseq[i] = (int[])jj_expentries.get(i);
|
||||
}
|
||||
return new ParseException(token, exptokseq, tokenImage);
|
||||
}
|
||||
|
||||
/** Enable tracing. The method body is empty, so calling it has no effect. */
|
||||
final public void enable_tracing() {
|
||||
}
|
||||
|
||||
/** Disable tracing. The method body is empty, so calling it has no effect. */
|
||||
final public void disable_tracing() {
|
||||
}
|
||||
|
||||
private void jj_rescan_token() {
|
||||
jj_rescan = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
try {
|
||||
JJCalls p = jj_2_rtns[i];
|
||||
do {
|
||||
if (p.gen > jj_gen) {
|
||||
jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
|
||||
switch (i) {
|
||||
case 0: jj_3_1(); break;
|
||||
case 1: jj_3_2(); break;
|
||||
}
|
||||
}
|
||||
p = p.next;
|
||||
} while (p != null);
|
||||
} catch(LookaheadSuccess ls) { }
|
||||
}
|
||||
jj_rescan = false;
|
||||
}
|
||||
|
||||
private void jj_save(int index, int xla) {
|
||||
JJCalls p = jj_2_rtns[index];
|
||||
while (p.gen > jj_gen) {
|
||||
if (p.next == null) { p = p.next = new JJCalls(); break; }
|
||||
p = p.next;
|
||||
}
|
||||
p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
|
||||
}
|
||||
|
||||
/** One record in a linked chain of saved lookahead calls; written by jj_save() and read by jj_rescan_token(). */
static final class JJCalls {
|
||||
// generation value; jj_save() stores jj_gen + xla - jj_la here
int gen;
|
||||
// token the lookahead started from (cleared by jj_consume_token() for old records)
Token first;
|
||||
// lookahead limit (xla) the call was made with
int arg;
|
||||
// next record in the chain, or null
JJCalls next;
|
||||
}
|
||||
|
||||
// void handleException(Exception e) {
|
||||
// System.out.println(e.toString()); // print the error message
|
||||
// System.out.println("Skipping...");
|
||||
// Token t;
|
||||
// do {
|
||||
// t = getNextToken();
|
||||
// } while (t.kind != TagEnd);
|
||||
// }
|
||||
}
|
@ -1,394 +0,0 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// HTMLParser.jj
|
||||
|
||||
options {
|
||||
STATIC = false;
|
||||
//DEBUG_LOOKAHEAD = true;
|
||||
//DEBUG_TOKEN_MANAGER = true;
|
||||
UNICODE_INPUT = true;
|
||||
USER_CHAR_STREAM=true;
|
||||
}
|
||||
|
||||
PARSER_BEGIN(HTMLParser)
|
||||
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Basic html parser (for demo/testing purposes only!)
|
||||
*/
|
||||
public class HTMLParser {
|
||||
public static int SUMMARY_LENGTH = 200;
|
||||
|
||||
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
|
||||
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
|
||||
Properties metaTags=new Properties();
|
||||
String currentMetaTag=null;
|
||||
String currentMetaContent=null;
|
||||
int length = 0;
|
||||
boolean titleComplete = false;
|
||||
boolean inTitle = false;
|
||||
boolean inMetaTag = false;
|
||||
boolean inStyle = false;
|
||||
boolean afterTag = false;
|
||||
boolean afterSpace = false;
|
||||
String eol = System.getProperty("line.separator");
|
||||
Reader pipeIn = null;
|
||||
Writer pipeOut;
|
||||
private MyPipedInputStream pipeInStream = null;
|
||||
private PipedOutputStream pipeOutStream = null;
|
||||
|
||||
public HTMLParser(Reader reader) {
|
||||
this(new FastCharStream(reader));
|
||||
}
|
||||
|
||||
private class MyPipedInputStream extends PipedInputStream{
|
||||
|
||||
public MyPipedInputStream(){
|
||||
super();
|
||||
}
|
||||
|
||||
public MyPipedInputStream(PipedOutputStream src) throws IOException{
|
||||
super(src);
|
||||
}
|
||||
|
||||
public boolean full() throws IOException{
|
||||
return this.available() >= PipedInputStream.PIPE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return title.toString().trim();
|
||||
}
|
||||
|
||||
public Properties getMetaTags() throws IOException,
|
||||
InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (titleComplete || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
return metaTags;
|
||||
}
|
||||
|
||||
|
||||
public String getSummary() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
while (true) {
|
||||
synchronized(this) {
|
||||
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
|
||||
break;
|
||||
wait(10);
|
||||
}
|
||||
}
|
||||
if (summary.length() > SUMMARY_LENGTH)
|
||||
summary.setLength(SUMMARY_LENGTH);
|
||||
|
||||
String sum = summary.toString().trim();
|
||||
String tit = getTitle();
|
||||
if (sum.equals(""))
|
||||
return tit;
|
||||
else
|
||||
return sum;
|
||||
}
|
||||
|
||||
public Reader getReader() throws IOException {
|
||||
if (pipeIn == null) {
|
||||
pipeInStream = new MyPipedInputStream();
|
||||
pipeOutStream = new PipedOutputStream(pipeInStream);
|
||||
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
|
||||
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
|
||||
|
||||
Thread thread = new ParserThread(this);
|
||||
thread.start(); // start parsing
|
||||
}
|
||||
|
||||
return pipeIn;
|
||||
}
|
||||
|
||||
void addToSummary(String text) {
|
||||
if (summary.length() < SUMMARY_LENGTH) {
|
||||
summary.append(text);
|
||||
if (summary.length() >= SUMMARY_LENGTH) {
|
||||
synchronized(this) {
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void addText(String text) throws IOException {
|
||||
if (inStyle)
|
||||
return;
|
||||
if (inTitle)
|
||||
title.append(text);
|
||||
else {
|
||||
addToSummary(text);
|
||||
if (!titleComplete && !(title.length() == 0)) { // finished title
|
||||
synchronized(this) {
|
||||
titleComplete = true; // tell waiting threads
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
length += text.length();
|
||||
pipeOut.write(text);
|
||||
|
||||
afterSpace = false;
|
||||
}
|
||||
|
||||
void addMetaTag() {
|
||||
metaTags.setProperty(currentMetaTag, currentMetaContent);
|
||||
currentMetaTag = null;
|
||||
currentMetaContent = null;
|
||||
return;
|
||||
}
|
||||
|
||||
void addSpace() throws IOException {
|
||||
if (!afterSpace) {
|
||||
if (inTitle)
|
||||
title.append(" ");
|
||||
else
|
||||
addToSummary(" ");
|
||||
|
||||
String space = afterTag ? eol : " ";
|
||||
length += space.length();
|
||||
pipeOut.write(space);
|
||||
afterSpace = true;
|
||||
}
|
||||
}
|
||||
|
||||
// void handleException(Exception e) {
|
||||
// System.out.println(e.toString()); // print the error message
|
||||
// System.out.println("Skipping...");
|
||||
// Token t;
|
||||
// do {
|
||||
// t = getNextToken();
|
||||
// } while (t.kind != TagEnd);
|
||||
// }
|
||||
}
|
||||
|
||||
PARSER_END(HTMLParser)
|
||||
|
||||
|
||||
// A document is any sequence of tags, declarations, comments, scripts, words,
// entities, punctuation and whitespace, followed by end of input.
// afterTag records whether the most recent element was markup.
void HTMLDocument() throws IOException :
{
  Token tok;
}
{
//  try {
  (
      Tag()          { afterTag = true; }
    | tok=Decl()     { afterTag = true; }
    | CommentTag()   { afterTag = true; }
    | ScriptTag()    { afterTag = true; }
    | tok=<Word>     { addText(tok.image); afterTag = false; }
    | tok=<Entity>   { addText(Entities.decode(tok.image)); afterTag = false; }
    | tok=<Punct>    { addText(tok.image); afterTag = false; }
    | <Space>        { addSpace(); afterTag = false; }
  )*
  <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }
}
|
||||
|
||||
/**
 * Parses one opening or closing tag together with its attributes.
 *
 * The tag name updates the inTitle / inMetaTag / inStyle flags and may emit a
 * separator for whitespace-producing elements. While attributes are read, the
 * ALT text of an IMG tag is added as "[text]", and the name (or HTTP-EQUIV)
 * and content attributes of a META tag are collected and stored once both
 * are known.
 */
void Tag() throws IOException :
{
  Token tag, attr, value;
  boolean inImg = false;
}
{
  tag=<TagName> {
    String tagName = tag.image.toLowerCase(Locale.ROOT);
    if (Tags.WS_ELEMS.contains(tagName)) {
      addSpace();
    }
    inTitle = tagName.equalsIgnoreCase("<title");    // keep track if in <TITLE>
    inMetaTag = tagName.equalsIgnoreCase("<META");   // keep track if in <META>
    inStyle = tagName.equalsIgnoreCase("<STYLE");    // keep track if in <STYLE>
    inImg = tagName.equalsIgnoreCase("<img");        // keep track if in <IMG>
  }
  ( attr=<ArgName>
    ( <ArgEquals>
      ( value=ArgValue()
        {
          // ArgValue() yields null for an empty quoted value; nothing to record then
          if (value != null) {
            String attrName = attr.image;

            // save ALT text in IMG tag
            if (inImg && attrName.equalsIgnoreCase("alt")) {
              addText("[" + value.image + "]");
            }

            if (inMetaTag
                && (attrName.equalsIgnoreCase("name") || attrName.equalsIgnoreCase("HTTP-EQUIV"))) {
              currentMetaTag = value.image.toLowerCase(Locale.ROOT);
              if (currentMetaTag != null && currentMetaContent != null) {
                addMetaTag();
              }
            }

            if (inMetaTag && attrName.equalsIgnoreCase("content")) {
              currentMetaContent = value.image.toLowerCase(Locale.ROOT);
              if (currentMetaTag != null && currentMetaContent != null) {
                addMetaTag();
              }
            }
          }
        }
      )?
    )?
  )*
  <TagEnd>
}
|
||||
|
||||
/**
 * Parses an attribute value: bare, single-quoted or double-quoted.
 * Returns the token holding the value text, or null when the quotes are empty.
 */
Token ArgValue() :
{
  Token value = null;
}
{
    value=<ArgValue>                                  { return value; }
  | LOOKAHEAD(2)
    <ArgQuote1> <CloseQuote1>                         { return null; }  // '' : no text
  | <ArgQuote1> value=<Quote1Text> <CloseQuote1>      { return value; }
  | LOOKAHEAD(2)
    <ArgQuote2> <CloseQuote2>                         { return null; }  // "" : no text
  | <ArgQuote2> value=<Quote2Text> <CloseQuote2>      { return value; }
}
|
||||
|
||||
|
||||
/**
 * Parses a declaration: a DeclName token followed by any mix of argument
 * names, argument values and equals signs up to the closing TagEnd.
 * Returns the DeclName token.
 */
Token Decl() :
{
  Token decl;
}
{
  decl=<DeclName>
  ( <ArgName> | ArgValue() | <ArgEquals> )*
  <TagEnd>
  { return decl; }
}
|
||||
|
||||
|
||||
/**
 * Consumes a comment in either supported form, opened by a Comment1 or a
 * Comment2 token, discarding its text.
 */
void CommentTag() :
{}
{
    ( <Comment1> ( <CommentText1> )* <CommentEnd1> )
  |
    ( <Comment2> ( <CommentText2> )* <CommentEnd2> )
}
|
||||
|
||||
/**
 * Consumes a script element from ScriptStart through ScriptEnd, discarding
 * everything in between.
 */
void ScriptTag() :
{}
{
  <ScriptStart>
  ( <ScriptText> )*
  <ScriptEnd>
}
|
||||
|
||||
|
||||
// Lexical specification. Each section lists the tokens recognized in one
// lexical state; a trailing ": State" switches the token manager to that state
// after the token is matched. Declaration order is significant - do not reorder.

// DEFAULT state: ordinary document content.
TOKEN :
{
  < ScriptStart: "<script" > : WithinScript
| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1: "<!--" > : WithinComment1
| < Comment2: "<!" > : WithinComment2

| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
<LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM: ["0"-"9"] >
| < #HEX: ["0"-"9","A"-"F","a"-"f"] >

| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >

| < Space: (<SP>)+ >
| < #SP: [" ","\t","\r","\n"] >

| < Punct: ~[] > // Keep this last. It is a catch-all.
}

// Inside a script element: everything is ScriptText until the closing tag.
<WithinScript> TOKEN:
{
< ScriptText: (~["<",">"])+ | "<" | ">" >
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
}

// Inside a tag or declaration, after its name.
<WithinTag> TOKEN:
{
< ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" > : AfterEquals
| < TagEnd: ">" | "=>" > : DEFAULT
}

// Directly after "=": an unquoted attribute value.
<AfterEquals> TOKEN:
{
< ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n",">"])* > : WithinTag
}

// An opening quote may appear in either state and starts a quoted value.
<WithinTag, AfterEquals> TOKEN:
{
< ArgQuote1: "'" > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}

// Whitespace inside a tag is skipped rather than turned into a token.
<WithinTag, AfterEquals> SKIP:
{
< <Space> >
}

// Single-quoted attribute value.
<WithinQuote1> TOKEN:
{
< Quote1Text: (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}

// Double-quoted attribute value.
<WithinQuote2> TOKEN:
{
< Quote2Text: (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}

// Comment opened by Comment1 ("<!--"); ends at "-->".
<WithinComment1> TOKEN :
{
< CommentText1: (~["-"])+ | "-" >
| < CommentEnd1: "-->" > : DEFAULT
}

// Comment opened by Comment2 ("<!"); ends at the next ">".
<WithinComment2> TOKEN :
{
< CommentText2: (~[">"])+ >
| < CommentEnd2: ">" > : DEFAULT
}
|
@ -1,124 +0,0 @@
|
||||
/* Generated By:JavaCC: Do not edit this line. HTMLParserConstants.java */
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
|
||||
/**
 * Token kinds, lexical states and printable token images of the HTML parser.
 * Generated by org.javacc.parser.OtherFilesGen#start()
 *
 * Every token kind is also the index of its image in {@link #tokenImage}.
 * Kind 22 has an image but no named constant.
 */
public interface HTMLParserConstants {

  /** End of File. */
  int EOF = 0;

  /** RegularExpression Ids for script, tag, declaration and comment openers. */
  int ScriptStart = 1, TagName = 2, DeclName = 3, Comment1 = 4, Comment2 = 5;

  /** RegularExpression Ids for words, entities, whitespace and punctuation. */
  int Word = 6, LET = 7, NUM = 8, HEX = 9, Entity = 10, Space = 11, SP = 12, Punct = 13;

  /** RegularExpression Ids for script content. */
  int ScriptText = 14, ScriptEnd = 15;

  /** RegularExpression Ids for tag arguments. */
  int ArgName = 16, ArgEquals = 17, TagEnd = 18, ArgValue = 19, ArgQuote1 = 20, ArgQuote2 = 21;

  /** RegularExpression Ids for quoted argument values. */
  int Quote1Text = 23, CloseQuote1 = 24, Quote2Text = 25, CloseQuote2 = 26;

  /** RegularExpression Ids for comment content. */
  int CommentText1 = 27, CommentEnd1 = 28, CommentText2 = 29, CommentEnd2 = 30;

  /** Lexical states. */
  int DEFAULT = 0,
      WithinScript = 1,
      WithinTag = 2,
      AfterEquals = 3,
      WithinQuote1 = 4,
      WithinQuote2 = 5,
      WithinComment1 = 6,
      WithinComment2 = 7;

  /** Literal token values. */
  String[] tokenImage = {
    "<EOF>",
    "\"<script\"",
    "<TagName>",
    "<DeclName>",
    "\"<!--\"",
    "\"<!\"",
    "<Word>",
    "<LET>",
    "<NUM>",
    "<HEX>",
    "<Entity>",
    "<Space>",
    "<SP>",
    "<Punct>",
    "<ScriptText>",
    "<ScriptEnd>",
    "<ArgName>",
    "\"=\"",
    "<TagEnd>",
    "<ArgValue>",
    "\"\\\'\"",
    "\"\\\"\"",
    "<token of kind 22>",
    "<Quote1Text>",
    "<CloseQuote1>",
    "<Quote2Text>",
    "<CloseQuote2>",
    "<CommentText1>",
    "\"-->\"",
    "<CommentText2>",
    "\">\"",
  };

}
|
File diff suppressed because it is too large
Load Diff
@ -1,198 +0,0 @@
|
||||
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
|
||||
/* JavaCCOptions:KEEP_LINE_COL=null */
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/**
|
||||
* This exception is thrown when parse errors are encountered.
|
||||
* You can explicitly create objects of this exception type by
|
||||
* calling the method generateParseException in the generated
|
||||
* parser.
|
||||
*
|
||||
* You can modify this class to customize your error reporting
|
||||
* mechanisms so long as you retain the public fields.
|
||||
*/
|
||||
public class ParseException extends Exception {
|
||||
|
||||
/**
|
||||
* This constructor is used by the method "generateParseException"
|
||||
* in the generated parser. Calling this constructor generates
|
||||
* a new object of this type with the fields "currentToken",
|
||||
* "expectedTokenSequences", and "tokenImage" set. The boolean
|
||||
* flag "specialConstructor" is also set to true to indicate that
|
||||
* this constructor was used to create this object.
|
||||
* This constructor calls its super class with the empty string
|
||||
* to force the "toString" method of parent class "Throwable" to
|
||||
* print the error message in the form:
|
||||
* ParseException: <result of getMessage>
|
||||
*/
|
||||
public ParseException(Token currentTokenVal,
|
||||
int[][] expectedTokenSequencesVal,
|
||||
String[] tokenImageVal
|
||||
)
|
||||
{
|
||||
super("");
|
||||
specialConstructor = true;
|
||||
currentToken = currentTokenVal;
|
||||
expectedTokenSequences = expectedTokenSequencesVal;
|
||||
tokenImage = tokenImageVal;
|
||||
}
|
||||
|
||||
/**
|
||||
* The following constructors are for use by you for whatever
|
||||
* purpose you can think of. Constructing the exception in this
|
||||
* manner makes the exception behave in the normal way - i.e., as
|
||||
* documented in the class "Throwable". The fields "errorToken",
|
||||
* "expectedTokenSequences", and "tokenImage" do not contain
|
||||
* relevant information. The JavaCC generated code does not use
|
||||
* these constructors.
|
||||
*/
|
||||
|
||||
public ParseException() {
|
||||
super();
|
||||
specialConstructor = false;
|
||||
}
|
||||
|
||||
/** Constructor with message. */
|
||||
public ParseException(String message) {
|
||||
super(message);
|
||||
specialConstructor = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* This variable determines which constructor was used to create
|
||||
* this object and thereby affects the semantics of the
|
||||
* "getMessage" method (see below).
|
||||
*/
|
||||
protected boolean specialConstructor;
|
||||
|
||||
/**
|
||||
* This is the last token that has been consumed successfully. If
|
||||
* this object has been created due to a parse error, the token
|
||||
* followng this token will (therefore) be the first error token.
|
||||
*/
|
||||
public Token currentToken;
|
||||
|
||||
/**
|
||||
* Each entry in this array is an array of integers. Each array
|
||||
* of integers represents a sequence of tokens (by their ordinal
|
||||
* values) that is expected at this point of the parse.
|
||||
*/
|
||||
public int[][] expectedTokenSequences;
|
||||
|
||||
/**
|
||||
* This is a reference to the "tokenImage" array of the generated
|
||||
* parser within which the parse error occurred. This array is
|
||||
* defined in the generated ...Constants interface.
|
||||
*/
|
||||
public String[] tokenImage;
|
||||
|
||||
/**
|
||||
* This method has the standard behavior when this object has been
|
||||
* created using the standard constructors. Otherwise, it uses
|
||||
* "currentToken" and "expectedTokenSequences" to generate a parse
|
||||
* error message and returns it. If this object has been created
|
||||
* due to a parse error, and you do not catch it (it gets thrown
|
||||
* from the parser), then this method is called during the printing
|
||||
* of the final stack trace, and hence the correct error message
|
||||
* gets displayed.
|
||||
*/
|
||||
public String getMessage() {
|
||||
if (!specialConstructor) {
|
||||
return super.getMessage();
|
||||
}
|
||||
StringBuffer expected = new StringBuffer();
|
||||
int maxSize = 0;
|
||||
for (int i = 0; i < expectedTokenSequences.length; i++) {
|
||||
if (maxSize < expectedTokenSequences[i].length) {
|
||||
maxSize = expectedTokenSequences[i].length;
|
||||
}
|
||||
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
|
||||
expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
|
||||
}
|
||||
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
|
||||
expected.append("...");
|
||||
}
|
||||
expected.append(eol).append(" ");
|
||||
}
|
||||
String retval = "Encountered \"";
|
||||
Token tok = currentToken.next;
|
||||
for (int i = 0; i < maxSize; i++) {
|
||||
if (i != 0) retval += " ";
|
||||
if (tok.kind == 0) {
|
||||
retval += tokenImage[0];
|
||||
break;
|
||||
}
|
||||
retval += " " + tokenImage[tok.kind];
|
||||
retval += " \"";
|
||||
retval += add_escapes(tok.image);
|
||||
retval += " \"";
|
||||
tok = tok.next;
|
||||
}
|
||||
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
|
||||
retval += "." + eol;
|
||||
if (expectedTokenSequences.length == 1) {
|
||||
retval += "Was expecting:" + eol + " ";
|
||||
} else {
|
||||
retval += "Was expecting one of:" + eol + " ";
|
||||
}
|
||||
retval += expected.toString();
|
||||
return retval;
|
||||
}
|
||||
|
||||
/**
|
||||
* The end of line string for this machine.
|
||||
*/
|
||||
protected String eol = System.getProperty("line.separator", "\n");
|
||||
|
||||
/**
|
||||
* Used to convert raw characters to their escaped version
|
||||
* when these raw version cannot be used as part of an ASCII
|
||||
* string literal.
|
||||
*/
|
||||
protected String add_escapes(String str) {
|
||||
StringBuffer retval = new StringBuffer();
|
||||
char ch;
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
switch (str.charAt(i))
|
||||
{
|
||||
case 0 :
|
||||
continue;
|
||||
case '\b':
|
||||
retval.append("\\b");
|
||||
continue;
|
||||
case '\t':
|
||||
retval.append("\\t");
|
||||
continue;
|
||||
case '\n':
|
||||
retval.append("\\n");
|
||||
continue;
|
||||
case '\f':
|
||||
retval.append("\\f");
|
||||
continue;
|
||||
case '\r':
|
||||
retval.append("\\r");
|
||||
continue;
|
||||
case '\"':
|
||||
retval.append("\\\"");
|
||||
continue;
|
||||
case '\'':
|
||||
retval.append("\\\'");
|
||||
continue;
|
||||
case '\\':
|
||||
retval.append("\\\\");
|
||||
continue;
|
||||
default:
|
||||
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
|
||||
String s = "0000" + Integer.toString(ch, 16);
|
||||
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
|
||||
} else {
|
||||
retval.append(ch);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return retval.toString();
|
||||
}
|
||||
|
||||
}
|
||||
/* JavaCC - OriginalChecksum=e449d0e43f3d85deb1260a88b7e90fcd (do not edit this line) */
|
@ -1,50 +0,0 @@
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
|
||||
class ParserThread extends Thread {
|
||||
HTMLParser parser;
|
||||
|
||||
ParserThread(HTMLParser p) {
|
||||
parser = p;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() { // convert pipeOut to pipeIn
|
||||
try {
|
||||
try { // parse document to pipeOut
|
||||
parser.HTMLDocument();
|
||||
} catch (ParseException e) {
|
||||
System.out.println("Parse Aborted: " + e.getMessage());
|
||||
} catch (TokenMgrError e) {
|
||||
System.out.println("Parse Aborted: " + e.getMessage());
|
||||
} finally {
|
||||
parser.pipeOut.close();
|
||||
synchronized (parser) {
|
||||
parser.summary.setLength(HTMLParser.SUMMARY_LENGTH);
|
||||
parser.titleComplete = true;
|
||||
parser.notifyAll();
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
 * Utility class storing set of commonly-used html tags.
 */
public final class Tags {

  /**
   * Contains all tags for which whitespace has to be inserted for proper
   * tokenization. Entries are lower-case tag openers without the closing
   * bracket, e.g. {@code "<p"} and {@code "</p"}.
   *
   * The set is unmodifiable: it is shared by every parser instance, so an
   * attempt to change it throws {@link UnsupportedOperationException}.
   */
  public static final Set<String> WS_ELEMS;

  static {
    Set<String> elems = new HashSet<String>();
    elems.add("<hr");
    elems.add("<hr/"); // note that "<hr />" does not need to be listed explicitly
    elems.add("<br");
    elems.add("<br/");
    elems.add("<p");
    elems.add("</p");
    elems.add("<div");
    elems.add("</div");
    elems.add("<td");
    elems.add("</td");
    elems.add("<li");
    elems.add("</li");
    elems.add("<q");
    elems.add("</q");
    elems.add("<blockquote");
    elems.add("</blockquote");
    elems.add("<dt");
    elems.add("</dt");
    elems.add("<h1");
    elems.add("</h1");
    elems.add("<h2");
    elems.add("</h2");
    elems.add("<h3");
    elems.add("</h3");
    elems.add("<h4");
    elems.add("</h4");
    elems.add("<h5");
    elems.add("</h5");
    elems.add("<h6");
    elems.add("</h6");
    // publish a read-only view so no caller can alter the shared set
    WS_ELEMS = java.util.Collections.unmodifiableSet(elems);
  }
}
|
@ -1,124 +0,0 @@
|
||||
/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
|
||||
/* JavaCCOptions:TOKEN_EXTENDS=,KEEP_LINE_COL=null */
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/**
 * Describes the input token stream.
 */
public class Token {

  /**
   * The kind of this token, as numbered by JavaCCParser; the table of these
   * numbers is stored in the file ...Constants.java.
   */
  public int kind;

  /** The line number of the first character of this Token. */
  public int beginLine;

  /** The column number of the first character of this Token. */
  public int beginColumn;

  /** The line number of the last character of this Token. */
  public int endLine;

  /** The column number of the last character of this Token. */
  public int endColumn;

  /** The string image of the token. */
  public String image;

  /**
   * The next regular (non-special) token from the input stream, or null if
   * this is the last token or the token manager has not read any further.
   * This holds only if this token is itself a regular token; for special
   * tokens see {@link #specialToken}.
   */
  public Token next;

  /**
   * The last special token that occurs before this token but after the
   * preceding regular token, or null if there is none. Earlier special
   * tokens are reached through that token's own specialToken field, back to
   * the first one (whose specialToken is null). The next field of a special
   * token refers to the special token immediately following it, or is null.
   */
  public Token specialToken;

  /**
   * No-argument constructor
   */
  public Token() {
  }

  /**
   * Constructs a new token of the specified kind with no image.
   */
  public Token(int kind) {
    this(kind, null);
  }

  /**
   * Constructs a new token for the specified Image and Kind.
   */
  public Token(int kind, String image) {
    this.image = image;
    this.kind = kind;
  }

  /**
   * An optional attribute value of the Token, often different from the
   * image. This implementation always returns null; a subclass that carries
   * a meaningful value can override the method.
   */
  public Object getValue() {
    return null;
  }

  /**
   * Returns the image.
   */
  public String toString() {
    return image;
  }

  /**
   * Factory for tokens. Returns a plain Token for every kind; this is the
   * place to return subclass instances for selected kinds, for example
   *
   *    case MyParserConstants.ID : return new IDToken(ofKind, image);
   *
   * after which matchedToken can be cast to that type in lexical actions.
   */
  public static Token newToken(int ofKind, String image) {
    return new Token(ofKind, image);
  }

  /** Factory for a token of the given kind with no image. */
  public static Token newToken(int ofKind) {
    return newToken(ofKind, null);
  }

}
|
||||
/* JavaCC - OriginalChecksum=24643dc85fd6daeec42ceba20b46ee61 (do not edit this line) */
|
@ -1,141 +0,0 @@
|
||||
/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 4.1 */
|
||||
/* JavaCCOptions: */
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
|
||||
/** Token Manager Error. */
@SuppressWarnings("serial")
public class TokenMgrError extends Error {

  /*
   * Ordinals for various reasons why an Error of this type can be thrown.
   */

  /** Lexical error occurred. */
  static final int LEXICAL_ERROR = 0;

  /** An attempt was made to create a second instance of a static token manager. */
  static final int STATIC_LEXER_ERROR = 1;

  /** Tried to change to an invalid lexical state. */
  static final int INVALID_LEXICAL_STATE = 2;

  /** Detected (and bailed out of) an infinite loop in the token manager. */
  static final int LOOP_DETECTED = 3;

  /**
   * The reason why the exception is thrown; one of the four values above.
   */
  int errorCode;

  /**
   * Replaces unprintable characters in the given string by their escaped
   * (or unicode escaped) equivalents. NUL characters are dropped.
   */
  protected static final String addEscapes(String str) {
    StringBuilder escaped = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
      char ch = str.charAt(i);
      switch (ch) {
        case 0:
          break;
        case '\b':
          escaped.append("\\b");
          break;
        case '\t':
          escaped.append("\\t");
          break;
        case '\n':
          escaped.append("\\n");
          break;
        case '\f':
          escaped.append("\\f");
          break;
        case '\r':
          escaped.append("\\r");
          break;
        case '\"':
          escaped.append("\\\"");
          break;
        case '\'':
          escaped.append("\\\'");
          break;
        case '\\':
          escaped.append("\\\\");
          break;
        default:
          if (ch < 0x20 || ch > 0x7e) {
            // zero-padded four-digit hex code
            String hex = "0000" + Integer.toString(ch, 16);
            escaped.append("\\u" + hex.substring(hex.length() - 4, hex.length()));
          } else {
            escaped.append(ch);
          }
          break;
      }
    }
    return escaped.toString();
  }

  /**
   * Builds the detailed message for a lexical error raised by the token
   * manager.
   *
   * @param EOFSeen     whether EOF caused the lexical error
   * @param lexState    lexical state in which the error occurred (not part of the message)
   * @param errorLine   line number where the error occurred
   * @param errorColumn column number where the error occurred
   * @param errorAfter  prefix that was seen before the error occurred
   * @param curChar     the offending character
   * @return the message text
   */
  protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
    String encountered;
    if (EOFSeen) {
      encountered = "<EOF> ";
    } else {
      encountered = "\"" + addEscapes(String.valueOf(curChar)) + "\"" + " (" + (int) curChar + "), ";
    }
    return "Lexical error at line " + errorLine
        + ", column " + errorColumn
        + ". Encountered: " + encountered
        + "after : \"" + addEscapes(errorAfter) + "\"";
  }

  /**
   * Returns the message of this error. The body can be changed to customize
   * messages, e.g. to hide LOOP_DETECTED and INVALID_LEXICAL_STATE details
   * from end users in a release build.
   */
  public String getMessage() {
    return super.getMessage();
  }

  /*
   * Constructors of various flavors follow.
   */

  /** No arg constructor. */
  public TokenMgrError() {
  }

  /** Constructor with message and reason. */
  public TokenMgrError(String message, int reason) {
    super(message);
    this.errorCode = reason;
  }

  /** Full Constructor. */
  public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
    this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
  }
}
|
||||
/* JavaCC - OriginalChecksum=538f0da130356fcc0bc7db621ab0389d (do not edit this line) */
|
@ -1,22 +0,0 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Example HTML parser based on JavaCC
|
||||
</body>
|
||||
</html>
|
@ -1,181 +0,0 @@
|
||||
package org.apache.lucene.benchmark.byTask.utils;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
 * Implements a {@link Reader} over a {@link StringBuilder} instance. Although
 * one can use {@link java.io.StringReader} by passing it
 * {@link StringBuilder#toString()}, it is better to use this class, as it
 * doesn't mark the passed-in {@link StringBuilder} as shared (which will cause
 * inner char[] allocations at the next append() attempt).<br>
 * Notes:
 * <ul>
 * <li>This implementation assumes the underlying {@link StringBuilder} is not
 * changed during the use of this {@link Reader} implementation. The length of
 * the builder is captured by {@link #set(StringBuilder)} and refreshed only by
 * {@link #reset()}.</li>
 * <li>Every operation that reads or changes the reader's state synchronizes on
 * the {@link Reader#lock lock} of this reader.</li>
 * <li>The implementation looks very much like {@link java.io.StringReader} (for
 * the right reasons).</li>
 * <li>If one wants to reuse that instance, then the following needs to be done:
 * <pre>
 * StringBuilder sb = new StringBuilder("some text");
 * Reader reader = new StringBuilderReader(sb);
 * ... read from reader - don't close it ! ...
 * sb.setLength(0);
 * sb.append("some new text");
 * reader.reset();
 * ... read the new string from the reader ...
 * </pre>
 * Note that {@link #reset()} repositions the reader at the last marked
 * position (the start of the text if {@link #mark(int)} was never called). To
 * unconditionally restart from offset 0, call {@link #set(StringBuilder)}
 * instead.</li>
 * </ul>
 */
public class StringBuilderReader extends Reader {

  // The StringBuilder to read from; null once the reader has been closed.
  private StringBuilder sb;

  // The length of 'sb' as observed by the last call to set() or reset().
  private int length;

  // The next position to read from the StringBuilder.
  private int next = 0;

  // The mark position. The default value 0 means the start of the text.
  private int mark = 0;

  /**
   * Creates a reader positioned at the start of {@code sb}.
   *
   * @param sb the builder to read from, must not be {@code null}
   * @throws NullPointerException if {@code sb} is {@code null}
   */
  public StringBuilderReader(StringBuilder sb) {
    set(sb);
  }

  /** Check to make sure that the stream has not been closed. */
  private void ensureOpen() throws IOException {
    if (sb == null) {
      throw new IOException("Stream has already been closed");
    }
  }

  /**
   * Closes the reader by dropping the reference to the underlying
   * {@link StringBuilder}. Subsequent read/mark/reset/skip/ready calls throw
   * {@link IOException} until {@link #set(StringBuilder)} is called again.
   */
  @Override
  public void close() {
    synchronized (lock) {
      sb = null;
    }
  }

  /**
   * Mark the present position in the stream. Subsequent calls to reset() will
   * reposition the stream to this point.
   *
   * @param readAheadLimit Limit on the number of characters that may be read
   *        while still preserving the mark. Because the stream's input comes
   *        from a StringBuilder, there is no actual limit, so this argument
   *        must not be negative, but is otherwise ignored.
   * @throws IllegalArgumentException If {@code readAheadLimit} is negative
   * @throws IOException If the reader has been closed
   */
  @Override
  public void mark(int readAheadLimit) throws IOException {
    if (readAheadLimit < 0) {
      throw new IllegalArgumentException("Read-ahead limit cannot be negative: " + readAheadLimit);
    }
    synchronized (lock) {
      ensureOpen();
      mark = next;
    }
  }

  /** Returns {@code true}: {@link #mark(int)} and {@link #reset()} are supported. */
  @Override
  public boolean markSupported() {
    return true;
  }

  /**
   * Reads a single character.
   *
   * @return the character read, or -1 if the end of the text has been reached
   * @throws IOException If the reader has been closed
   */
  @Override
  public int read() throws IOException {
    synchronized (lock) {
      ensureOpen();
      return next >= length ? -1 : sb.charAt(next++);
    }
  }

  /**
   * Reads up to {@code len} characters into {@code cbuf} starting at
   * {@code off}.
   *
   * @param cbuf destination buffer
   * @param off offset in {@code cbuf} at which to start storing characters
   * @param len maximum number of characters to read
   * @return the number of characters read, 0 if {@code len} is 0, or -1 if the
   *         end of the text has been reached
   * @throws IndexOutOfBoundsException If {@code off} or {@code len} is
   *         negative, or {@code off + len} exceeds {@code cbuf.length}
   * @throws IOException If the reader has been closed
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    synchronized (lock) {
      ensureOpen();

      // Validate parameters. 'len > cbuf.length - off' is used instead of
      // 'off + len > cbuf.length' because the sum can overflow to a negative
      // value and slip past the check. It also rejects off > cbuf.length.
      if (off < 0 || len < 0 || len > cbuf.length - off) {
        throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length);
      }

      if (len == 0) {
        return 0;
      }

      if (next >= length) {
        return -1;
      }

      int n = Math.min(length - next, len);
      sb.getChars(next, next + n, cbuf, off);
      next += n;
      return n;
    }
  }

  /**
   * Tells whether this reader is ready to be read.
   *
   * @return always {@code true} while the reader is open
   * @throws IOException If the reader has been closed
   */
  @Override
  public boolean ready() throws IOException {
    synchronized (lock) {
      ensureOpen();
      return true;
    }
  }

  /**
   * Repositions the reader at the last marked position (the start of the text
   * if it was never marked) and re-reads the current length of the underlying
   * {@link StringBuilder}, so content appended since the last
   * {@link #set(StringBuilder)} or {@code reset()} becomes visible.
   *
   * @throws IOException If the reader has been closed
   */
  @Override
  public void reset() throws IOException {
    synchronized (lock) {
      ensureOpen();
      next = mark;
      length = sb.length();
    }
  }

  /**
   * Points this reader at {@code sb} and rewinds it: both the read position
   * and the mark are set to the start of the text. This also re-opens a reader
   * that was previously closed.
   *
   * @param sb the builder to read from, must not be {@code null}
   * @throws NullPointerException if {@code sb} is {@code null}; the reader's
   *         state is left untouched in that case
   */
  public void set(StringBuilder sb) {
    // Fail before touching any state, so that a bad argument cannot leave the
    // reader silently closed (this.sb == null).
    if (sb == null) {
      throw new NullPointerException("sb must not be null");
    }
    synchronized (lock) {
      this.sb = sb;
      length = sb.length();
      next = mark = 0;
    }
  }

  /**
   * Skips characters. A negative {@code ns} skips backwards. The skip is
   * bounded by the beginning and the end of the text.
   *
   * @param ns number of characters to skip, may be negative
   * @return the number of characters actually skipped (negative when skipping
   *         backwards); 0 if the reader is already at the end of the text
   * @throws IOException If the reader has been closed
   */
  @Override
  public long skip(long ns) throws IOException {
    synchronized (lock) {
      ensureOpen();
      if (next >= length) {
        return 0;
      }

      // Bound skip by beginning and end of the source
      long n = Math.min(length - next, ns);
      n = Math.max(-next, n);
      next += n;
      return n;
    }
  }

}
|
@ -1,4 +1,4 @@
|
||||
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
@ -17,46 +17,46 @@ package org.apache.lucene.benchmark.byTask.feeds.demohtml;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser.Parser;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestHtmlParser extends LuceneTestCase {
|
||||
|
||||
public void testUnicode() throws Exception {
|
||||
String text = "<html><body>汉语</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("汉语", parser);
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("汉语", parser.body);
|
||||
}
|
||||
|
||||
public void testEntities() throws Exception {
|
||||
String text = "<html><body>汉语¥</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("汉语¥", parser);
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("汉语¥", parser.body);
|
||||
}
|
||||
|
||||
public void testComments() throws Exception {
|
||||
String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("foo", parser.body);
|
||||
}
|
||||
|
||||
public void testScript() throws Exception {
|
||||
String text = "<html><body><script type=\"text/javascript\">" +
|
||||
"document.write(\"test\")</script>foo</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("foo", parser.body);
|
||||
}
|
||||
|
||||
public void testStyle() throws Exception {
|
||||
String text = "<html><head><style type=\"text/css\">" +
|
||||
"body{background-color:blue;}</style>" +
|
||||
"</head><body>foo</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("foo", parser.body);
|
||||
}
|
||||
|
||||
public void testDoctype() throws Exception {
|
||||
@ -64,8 +64,8 @@ public class TestHtmlParser extends LuceneTestCase {
|
||||
"\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
|
||||
"\"http://www.w3.org/TR/html4/loose.dtd\">" +
|
||||
"<html><body>foo</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("foo", parser.body);
|
||||
}
|
||||
|
||||
public void testMeta() throws Exception {
|
||||
@ -75,58 +75,68 @@ public class TestHtmlParser extends LuceneTestCase {
|
||||
"<meta name=\"keywords\" content=\"this is a test\" />" +
|
||||
"<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
|
||||
"</head><body>foobar</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
Properties tags = parser.getMetaTags();
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
Properties tags = parser.metaTags;
|
||||
assertEquals(4, tags.size());
|
||||
assertEquals("1", tags.get("a"));
|
||||
assertEquals("2", tags.get("b"));
|
||||
assertEquals("this is a test", tags.get("keywords"));
|
||||
assertEquals("text/html;charset=utf-8", tags.get("content-type"));
|
||||
assertEquals("text/html;charset=UTF-8", tags.get("content-type"));
|
||||
}
|
||||
|
||||
public void testTitle() throws Exception {
|
||||
String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertEquals("foo", parser.getTitle());
|
||||
}
|
||||
|
||||
public void testSummary() throws Exception {
|
||||
String text = "<html><head><TITLE>foo</TITLE><head><body>" +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertEquals(200, parser.getSummary().length());
|
||||
}
|
||||
|
||||
// LUCENE-590
|
||||
public void testSummaryTitle() throws Exception {
|
||||
String text = "<html><head><title>Summary</title></head><body>Summary of the document</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertEquals("Summary of the document", parser.getSummary());
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("foo", parser.title);
|
||||
}
|
||||
|
||||
// LUCENE-2246
|
||||
public void testTurkish() throws Exception {
|
||||
String text = "<html><body>" +
|
||||
"<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
|
||||
"<a title=\"(ııı)\"></body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("[ş]", parser);
|
||||
final Locale saved = Locale.getDefault();
|
||||
try {
|
||||
Locale.setDefault(new Locale("tr", "TR"));
|
||||
String text = "<html><HEAD><TITLE>ııı</TITLE></head><body>" +
|
||||
"<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
|
||||
"<a title=\"(ııı)\"></body></html>";
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("ııı", parser.title);
|
||||
assertEquals("[ş]", parser.body);
|
||||
} finally {
|
||||
Locale.setDefault(saved);
|
||||
}
|
||||
}
|
||||
|
||||
private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
|
||||
Reader reader = parser.getReader();
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1) {
|
||||
builder.append((char)ch);
|
||||
}
|
||||
assertEquals(expected, builder.toString());
|
||||
public void testSampleTRECDoc() throws Exception {
|
||||
String text = "<html>\r\n" +
|
||||
"\r\n" +
|
||||
"<head>\r\n" +
|
||||
"<title>\r\n" +
|
||||
"TEST-000 title\r\n" +
|
||||
"</title>\r\n" +
|
||||
"</head>\r\n" +
|
||||
"\r\n" +
|
||||
"<body>\r\n" +
|
||||
"TEST-000 text\r\n" +
|
||||
"\r\n" +
|
||||
"</body>\r\n" +
|
||||
"\r\n";
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("TEST-000 title", parser.title);
|
||||
assertEquals("TEST-000 text", parser.body.trim());
|
||||
}
|
||||
|
||||
public void testNoHTML() throws Exception {
|
||||
String text = "hallo";
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("", parser.title);
|
||||
assertEquals("hallo", parser.body);
|
||||
}
|
||||
|
||||
public void testivalid() throws Exception {
|
||||
String text = "<title>foo</title>bar";
|
||||
Parser parser = new Parser(new StringReader(text));
|
||||
assertEquals("foo", parser.title);
|
||||
assertEquals("bar", parser.body);
|
||||
}
|
||||
|
||||
}
|
@ -166,6 +166,7 @@ public class TrecContentSourceTest extends LuceneTestCase {
|
||||
"<title>\r\n" +
|
||||
"TEST-001 title\r\n" +
|
||||
"</title>\r\n" +
|
||||
"<meta name=\"date\" content=\"Tue, 09 Dec 2003 22:39:08 GMT\">" +
|
||||
"</head>\r\n" +
|
||||
"\r\n" +
|
||||
"<body>\r\n" +
|
||||
@ -183,7 +184,7 @@ public class TrecContentSourceTest extends LuceneTestCase {
|
||||
|
||||
dd = source.getNextDocData(dd);
|
||||
assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
|
||||
.parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
|
||||
.parseDate("Tue, 09 Dec 2003 22:39:08 GMT"));
|
||||
|
||||
assertNoMoreDataException(source);
|
||||
}
|
||||
@ -331,6 +332,7 @@ public class TrecContentSourceTest extends LuceneTestCase {
|
||||
dd = source.getNextDocData(dd);
|
||||
assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
|
||||
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
|
||||
source.close();
|
||||
|
||||
// Don't test that NoMoreDataException is thrown, since the forever flag is
|
||||
// turned on.
|
||||
|
Loading…
x
Reference in New Issue
Block a user