LUCENE-4220: Remove the buggy JavaCC-based HTML parser in the benchmark module and replace it with NekoHTML

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361741 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2012-07-15 17:34:47 +00:00
parent 3377e98fdb
commit 67b1fdfc5d
31 changed files with 281 additions and 4397 deletions
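At its core, the change swaps the generated JavaCC parser (which pushed text through a piped stream and a background parsing thread) for NekoHTML's SAX parser driven by a plain DefaultHandler. A minimal, self-contained sketch of that approach, not taken from the commit (class and handler names are illustrative; it assumes nekohtml and xercesImpl on the classpath):

import java.io.StringReader;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.DefaultHandler;

public class NekoHtmlSketch {
  public static void main(String[] args) throws Exception {
    final StringBuilder title = new StringBuilder(), body = new StringBuilder();
    SAXParser parser = new SAXParser();
    // the same kind of configuration the new DemoHTMLParser uses
    parser.setFeature("http://xml.org/sax/features/namespaces", true);
    parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    parser.setContentHandler(new DefaultHandler() {
      private boolean inTitle, inBody;
      @Override
      public void startElement(String uri, String localName, String qName, Attributes atts) {
        if ("title".equals(localName)) inTitle = true;
        else if ("body".equals(localName)) inBody = true;
      }
      @Override
      public void endElement(String uri, String localName, String qName) {
        if ("title".equals(localName)) inTitle = false;
        else if ("body".equals(localName)) inBody = false;
      }
      @Override
      public void characters(char[] ch, int start, int length) {
        if (inTitle) title.append(ch, start, length);
        else if (inBody) body.append(ch, start, length);
      }
    });
    parser.parse(new InputSource(new StringReader(
        "<html><head><title>Hello</title></head><body>world</body></html>")));
    System.out.println(title + " / " + body);   // prints "Hello / world"
  }
}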

View File

@ -102,6 +102,7 @@
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
<classpathentry kind="lib" path="lucene/benchmark/lib/nekohtml-1.9.15.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-cli-1.2.jar"/>
<classpathentry kind="lib" path="solr/lib/httpclient-4.1.3.jar"/>

View File

@ -29,6 +29,10 @@ API Changes
make a custom FieldType and set indexed = true, it's analyzed by the analyzer.
(Robert Muir)
* LUCENE-4220: Removed the buggy JavaCC-based HTML parser in the benchmark
module and replaced it with NekoHTML. The HTMLParser interface was cleaned up
and its method signatures changed. (Uwe Schindler, Robert Muir)
Optimizations
* LUCENE-4171: Performance improvements to Packed64.

View File

@ -155,6 +155,7 @@
<fileset dir="lib">
<include name="commons-compress-1.2.jar"/>
<include name="xercesImpl-2.9.1.jar"/>
<include name="nekohtml-1.9.15.jar"/>
</fileset>
</path>
<path id="run.classpath">
@ -261,20 +262,6 @@
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
<target name="clean-javacc">
<delete>
<fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
<containsregexp expression="Generated.*By.*JavaCC"/>
</fileset>
</delete>
</target>
<target name="javacc" depends="init,javacc-check" if="javacc.present">
<invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
/>
</target>
<target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
<target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">
<copy todir="${build.dir}/classes/test/conf">

View File

@ -21,6 +21,7 @@
<dependencies>
<dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
<dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
<dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>

View File

@ -0,0 +1 @@
a45cd7b7401d9c2264d4908182380452c03ebf8f

View File

@ -19,51 +19,203 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.IOException;
import java.io.Reader;
import java.text.DateFormat;
import java.text.ParseException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* HTML Parser that is based on Lucene's demo HTML parser.
* Simple HTML Parser extracting title, meta tags, and body text
* that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
*/
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader);
public class DemoHTMLParser implements HTMLParser {
/** The actual parser to read HTML documents */
public static final class Parser {
// title
if (title==null) {
title = p.getTitle();
public final Properties metaTags = new Properties();
public final String title, body;
public Parser(Reader reader) throws IOException, SAXException {
this(new InputSource(reader));
}
public Parser(InputSource source) throws IOException, SAXException {
final SAXParser parser = new SAXParser();
parser.setFeature("http://xml.org/sax/features/namespaces", true);
parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
final StringBuilder title = new StringBuilder(), body = new StringBuilder();
final DefaultHandler handler = new DefaultHandler() {
private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
if (inHEAD > 0) {
if (equalsIgnoreTurkish("title", localName)) {
inTITLE++;
} else {
if (equalsIgnoreTurkish("meta", localName)) {
String name = atts.getValue("name");
if (name == null) {
name = atts.getValue("http-equiv");
}
final String val = atts.getValue("content");
if (name != null && val != null) {
metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
}
}
}
} else if (inBODY > 0) {
if (SUPPRESS_ELEMENTS.contains(localName)) {
suppressed++;
} else if (equalsIgnoreTurkish("img", localName)) {
// the original javacc-based parser preserved <IMG alt="..."/>
// attribute as body text in [] parenthesis:
final String alt = atts.getValue("alt");
if (alt != null) {
body.append('[').append(alt).append(']');
}
}
} else if (equalsIgnoreTurkish("body", localName)) {
inBODY++;
} else if (equalsIgnoreTurkish("head", localName)) {
inHEAD++;
} else if (equalsIgnoreTurkish("frameset", localName)) {
throw new SAXException("This parser does not support HTML framesets.");
}
}
@Override
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
if (inBODY > 0) {
if (equalsIgnoreTurkish("body", localName)) {
inBODY--;
} else if (ENDLINE_ELEMENTS.contains(localName)) {
body.append('\n');
} else if (SUPPRESS_ELEMENTS.contains(localName)) {
suppressed--;
}
} else if (inHEAD > 0) {
if (equalsIgnoreTurkish("head", localName)) {
inHEAD--;
} else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
inTITLE--;
}
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (inBODY > 0 && suppressed == 0) {
body.append(ch, start, length);
} else if (inTITLE > 0) {
title.append(ch, start, length);
}
}
@Override
public InputSource resolveEntity(String publicId, String systemId) {
// disable network access caused by DTDs
return new InputSource(new StringReader(""));
}
};
parser.setContentHandler(handler);
parser.setErrorHandler(handler);
parser.parse(source);
// the javacc-based parser trimmed title (which should be done for HTML in all cases):
this.title = title.toString().trim();
// assign body text
this.body = body.toString();
}
// TODO: remove the Turkish workaround once this is fixed in NekoHTML:
// https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
// BEGIN: workaround
static final String convertTurkish(String s) {
return s.replace('i', 'ı');
}
static final boolean equalsIgnoreTurkish(String s1, String s2) {
final int len1 = s1.length(), len2 = s2.length();
if (len1 != len2)
return false;
for (int i = 0; i < len1; i++) {
char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
if (ch1 == 'ı') ch1 = 'i';
if (ch2 == 'ı') ch2 = 'i';
if (ch1 != ch2)
return false;
}
return true;
}
// END: workaround
static final Set<String> createElementNameSet(String... names) {
final HashSet<String> set = new HashSet<String>();
for (final String name : names) {
set.add(name);
set.add(convertTurkish(name));
}
return Collections.unmodifiableSet(set);
}
/** HTML elements that cause a line break (they are block-elements) */
static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
"p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
"pre", "hr", "blockquote", "address", "fieldset", "table", "form",
"noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
);
/** HTML elements with contents that are ignored */
static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
"style", "script"
);
}
@Override
public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
try {
return parse(docData, name, date, new InputSource(reader), trecSrc);
} catch (SAXException saxe) {
throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
}
}
public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
final Parser p = new Parser(source);
// properties
Properties props = p.getMetaTags();
// body
Reader r = p.getReader();
char c[] = new char[1024];
StringBuilder bodyBuf = new StringBuilder();
int n;
while ((n = r.read(c)) >= 0) {
if (n>0) {
bodyBuf.append(c,0,n);
}
}
r.close();
if (date == null && props.getProperty("date")!=null) {
try {
date = dateFormat.parse(props.getProperty("date").trim());
} catch (ParseException e) {
// do not fail test just because a date could not be parsed
System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
date = new Date(); // now
final Properties props = p.metaTags;
String dateStr = props.getProperty("date");
if (dateStr != null) {
final Date newDate = trecSrc.parseDate(dateStr);
if (newDate != null) {
date = newDate;
}
}
docData.clear();
docData.setName(name);
docData.setBody(bodyBuf.toString());
docData.setTitle(title);
docData.setBody(p.body);
docData.setTitle(p.title);
docData.setProps(props);
docData.setDate(date);
return docData;
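For orientation, a hedged usage sketch of the inner Parser class added above (the wrapper class name is made up; everything else follows the constructor and public final fields shown in the diff):

import java.io.StringReader;
import org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser;

public class ParserUsageSketch {
  public static void main(String[] args) throws Exception {
    DemoHTMLParser.Parser p = new DemoHTMLParser.Parser(new StringReader(
        "<html><head><title> Demo </title>"
        + "<meta name=\"date\" content=\"Sun, 15 Jul 2012\"></head>"
        + "<body>some <b>body</b> text</body></html>"));
    System.out.println(p.title);                         // "Demo" (trimmed, like the old parser)
    System.out.println(p.body);                          // "some body text"
    System.out.println(p.metaTags.getProperty("date"));  // "Sun, 15 Jul 2012"
  }
}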

View File

@ -19,7 +19,6 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.IOException;
import java.io.Reader;
import java.text.DateFormat;
import java.util.Date;
/**
@ -34,13 +33,11 @@ public interface HTMLParser {
* @param docData result reused
* @param name name of the result doc data.
* @param date date of the result doc data. If null, attempt to set by parsed data.
* @param title title of the result doc data. If null, attempt to set by parsed data.
* @param reader reader of html text to parse.
* @param dateFormat date formatter to use for extracting the date.
* @param trecSrc the {@link TrecContentSource} used to parse dates.
* @return Parsed doc data.
* @throws IOException
* @throws InterruptedException
*/
public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
}
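The reworked interface is now a single-method contract that takes the raw Reader plus the TrecContentSource (for date parsing) and throws only IOException; the pre-extracted title, the DateFormat argument, and InterruptedException are gone. A hedged sketch of a trivial implementation, not part of the commit (the class name is invented; it skips HTML parsing entirely and stores the raw text as the body):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Date;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.HTMLParser;
import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource;

/** Illustrative only: passes the raw HTML through unparsed as the body. */
public class RawHTMLParser implements HTMLParser {
  @Override
  public DocData parse(DocData docData, String name, Date date, Reader reader,
      TrecContentSource trecSrc) throws IOException {
    StringBuilder sb = new StringBuilder();
    BufferedReader br = new BufferedReader(reader);
    for (int c = br.read(); c != -1; c = br.read()) {
      sb.append((char) c);
    }
    docData.clear();
    docData.setName(name);
    docData.setTitle("");
    docData.setBody(sb.toString());
    docData.setProps(new Properties());
    docData.setDate(date);
    return docData;
  }
}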

View File

@ -22,7 +22,6 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@ -33,8 +32,6 @@ import java.util.Locale;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
import org.apache.lucene.util.ThreadInterruptedException;
/**
* Implements a {@link ContentSource} over the TREC collection.
@ -57,7 +54,7 @@ import org.apache.lucene.util.ThreadInterruptedException;
*/
public class TrecContentSource extends ContentSource {
private static final class DateFormatInfo {
static final class DateFormatInfo {
DateFormat[] dfs;
ParsePosition pos;
}
@ -83,13 +80,10 @@ public class TrecContentSource extends ContentSource {
};
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
private File dataDir = null;
private ArrayList<File> inputFiles = new ArrayList<File>();
private int nextFile = 0;
private int rawDocSize = 0;
// Use to synchronize threads on reading from the TREC documents.
private Object lock = new Object();
@ -126,17 +120,6 @@ public class TrecContentSource extends ContentSource {
return sb;
}
Reader getTrecDocReader(StringBuilder docBuffer) {
StringBuilderReader r = trecDocReader.get();
if (r == null) {
r = new StringBuilderReader(docBuffer);
trecDocReader.set(r);
} else {
r.set(docBuffer);
}
return r;
}
HTMLParser getHtmlParser() {
return htmlParser;
}
@ -161,7 +144,7 @@ public class TrecContentSource extends ContentSource {
continue;
}
rawDocSize += line.length();
line.length();
if (lineStart!=null && line.startsWith(lineStart)) {
if (collectMatchLine) {
@ -287,12 +270,8 @@ public class TrecContentSource extends ContentSource {
// This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe.
try {
docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
addItem();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
addItem();
return docData;
}

View File

@ -80,7 +80,7 @@ public abstract class TrecDocParser {
* parsers to alter their behavior according to the file path type.
*/
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
StringBuilder docBuf, ParsePathType pathType) throws IOException;
/**
* strip tags from <code>buf</code>: each tag is replaced by a single blank.

View File

@ -37,7 +37,7 @@ public class TrecFBISParser extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date, title
Date date = null;

View File

@ -41,7 +41,7 @@ public class TrecFR94Parser extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date (no title?)
Date date = null;

View File

@ -33,7 +33,7 @@ public class TrecFTParser extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// date...

View File

@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Date;
/**
@ -31,29 +31,24 @@ public class TrecGov2Parser extends TrecDocParser {
private static final String DOCHDR = "<DOCHDR>";
private static final String TERMINATING_DOCHDR = "</DOCHDR>";
private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
// Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
Reader r = trecSrc.getTrecDocReader(docBuf);
// skip some of the text, optionally set date
StringBuilder docBuf, ParsePathType pathType) throws IOException {
// skip some of the non-html text, optionally set date
Date date = null;
int h1 = docBuf.indexOf(DOCHDR);
if (h1>=0) {
int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
int start = 0;
final int h1 = docBuf.indexOf(DOCHDR);
if (h1 >= 0) {
final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
r.mark(h2+TERMINATING_DOCHDR_LENGTH);
start = h2 + TERMINATING_DOCHDR.length();
}
r.reset();
HTMLParser htmlParser = trecSrc.getHtmlParser();
return htmlParser.parse(docData, name, date, null, r, null);
final String html = docBuf.substring(start);
return trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
}
}
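The net effect of this hunk: instead of re-reading docBuf through a per-thread, mark/reset StringBuilderReader, the parser now substrings past </DOCHDR> and hands a fresh StringReader to the HTML parser. A small self-contained demonstration of that header-skipping pattern (the buffer contents are made up):

import java.io.Reader;
import java.io.StringReader;

public class DocHdrSkipSketch {
  public static void main(String[] args) {
    // illustrative GOV2-style record: a non-HTML header block followed by the HTML payload
    StringBuilder docBuf = new StringBuilder(
        "<DOCHDR>\nDate: Sun, 15 Jul 2012\n</DOCHDR><html><body>payload</body></html>");
    int start = 0;
    final int h1 = docBuf.indexOf("<DOCHDR>");
    if (h1 >= 0) {
      final int h2 = docBuf.indexOf("</DOCHDR>", h1);
      start = h2 + "</DOCHDR>".length();      // skip everything up to and including </DOCHDR>
    }
    Reader html = new StringReader(docBuf.substring(start));
    System.out.println(docBuf.substring(start)); // "<html><body>payload</body></html>"
    // the real TrecGov2Parser hands 'html' to trecSrc.getHtmlParser().parse(...)
  }
}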

View File

@ -36,7 +36,7 @@ public class TrecLATimesParser extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// date...

View File

@ -26,7 +26,7 @@ public class TrecParserByPath extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
StringBuilder docBuf, ParsePathType pathType) throws IOException {
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
}

View File

@ -1,112 +0,0 @@
/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */
/* JavaCCOptions:STATIC=false */
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/**
* This interface describes a character stream that maintains line and
* column number positions of the characters. It also has the capability
* to backup the stream to some extent. An implementation of this
* interface is used in the TokenManager implementation generated by
* JavaCCParser.
*
* All the methods except backup can be implemented in any fashion. backup
* needs to be implemented correctly for the correct operation of the lexer.
* Rest of the methods are all used to get information like line number,
* column number and the String that constitutes a token and are not used
* by the lexer. Hence their implementation won't affect the generated lexer's
* operation.
*/
public interface CharStream {
/**
* Returns the next character from the selected input. The method
* of selecting the input is the responsibility of the class
* implementing this interface. Can throw any java.io.IOException.
*/
char readChar() throws java.io.IOException;
/**
* Returns the column position of the character last read.
* @deprecated
* @see #getEndColumn
*/
int getColumn();
/**
* Returns the line number of the character last read.
* @deprecated
* @see #getEndLine
*/
int getLine();
/**
* Returns the column number of the last character for current token (being
* matched after the last call to BeginToken).
*/
int getEndColumn();
/**
* Returns the line number of the last character for current token (being
* matched after the last call to BeginToken).
*/
int getEndLine();
/**
* Returns the column number of the first character for current token (being
* matched after the last call to BeginToken).
*/
int getBeginColumn();
/**
* Returns the line number of the first character for current token (being
* matched after the last call to BeginToken).
*/
int getBeginLine();
/**
* Backs up the input stream by amount steps. Lexer calls this method if it
* had already read some characters, but could not use them to match a
* (longer) token. So, they will be used again as the prefix of the next
* token and it is the implementation's responsibility to do this right.
*/
void backup(int amount);
/**
* Returns the next character that marks the beginning of the next token.
* All characters must remain in the buffer between two successive calls
* to this method to implement backup correctly.
*/
char BeginToken() throws java.io.IOException;
/**
* Returns a string made up of characters from the marked token beginning
* to the current buffer position. Implementations have the choice of returning
* anything that they want to. For example, for efficiency, one might decide
* to just return null, which is a valid implementation.
*/
String GetImage();
/**
* Returns an array of characters that make up the suffix of length 'len' for
* the currently matched token. This is used to build up the matched string
* for use in actions in the case of MORE. A simple and inefficient
* implementation of this is as follows :
*
* {
* String t = GetImage();
* return t.substring(t.length() - len, t.length()).toCharArray();
* }
*/
char[] GetSuffix(int len);
/**
* The lexer calls this function to indicate that it is done with the stream
* and hence implementations can free any resources held by this class.
* Again, the body of this function can be just empty and it will not
* affect the lexer's operation.
*/
void Done();
}
/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */

View File

@ -1,330 +0,0 @@
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
/**
* Utility class for encoding and decoding HTML entities.
*/
public class Entities {
static final Map<String,String> decoder = new HashMap<String,String>(300);
static final String[] encoder = new String[0x100];
static final String decode(String entity) {
if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon
entity = entity.substring(0, entity.length()-1);
if (entity.charAt(1) == '#') {
int start = 2;
int radix = 10;
if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
start++;
radix = 16;
}
Character c =
new Character((char)Integer.parseInt(entity.substring(start), radix));
return c.toString();
} else {
String s = decoder.get(entity);
if (s != null)
return s;
else return "";
}
}
public static final String encode(String s) {
int length = s.length();
StringBuffer buffer = new StringBuffer(length * 2);
for (int i = 0; i < length; i++) {
int j = s.charAt(i);
if (j < 0x100 && encoder[j] != null) {
buffer.append(encoder[j]); // have a named encoding
buffer.append(';');
} else if (j < 0x80) {
buffer.append((char) j); // use ASCII value
} else {
buffer.append("&#"); // use numeric encoding
buffer.append(j).append(';');
}
}
return buffer.toString();
}
static final void add(String entity, int value) {
decoder.put(entity, (new Character((char)value)).toString());
if (value < 0x100)
encoder[value] = entity;
}
static {
add("&nbsp", 160);
add("&iexcl", 161);
add("&cent", 162);
add("&pound", 163);
add("&curren", 164);
add("&yen", 165);
add("&brvbar", 166);
add("&sect", 167);
add("&uml", 168);
add("&copy", 169);
add("&ordf", 170);
add("&laquo", 171);
add("&not", 172);
add("&shy", 173);
add("&reg", 174);
add("&macr", 175);
add("&deg", 176);
add("&plusmn", 177);
add("&sup2", 178);
add("&sup3", 179);
add("&acute", 180);
add("&micro", 181);
add("&para", 182);
add("&middot", 183);
add("&cedil", 184);
add("&sup1", 185);
add("&ordm", 186);
add("&raquo", 187);
add("&frac14", 188);
add("&frac12", 189);
add("&frac34", 190);
add("&iquest", 191);
add("&Agrave", 192);
add("&Aacute", 193);
add("&Acirc", 194);
add("&Atilde", 195);
add("&Auml", 196);
add("&Aring", 197);
add("&AElig", 198);
add("&Ccedil", 199);
add("&Egrave", 200);
add("&Eacute", 201);
add("&Ecirc", 202);
add("&Euml", 203);
add("&Igrave", 204);
add("&Iacute", 205);
add("&Icirc", 206);
add("&Iuml", 207);
add("&ETH", 208);
add("&Ntilde", 209);
add("&Ograve", 210);
add("&Oacute", 211);
add("&Ocirc", 212);
add("&Otilde", 213);
add("&Ouml", 214);
add("&times", 215);
add("&Oslash", 216);
add("&Ugrave", 217);
add("&Uacute", 218);
add("&Ucirc", 219);
add("&Uuml", 220);
add("&Yacute", 221);
add("&THORN", 222);
add("&szlig", 223);
add("&agrave", 224);
add("&aacute", 225);
add("&acirc", 226);
add("&atilde", 227);
add("&auml", 228);
add("&aring", 229);
add("&aelig", 230);
add("&ccedil", 231);
add("&egrave", 232);
add("&eacute", 233);
add("&ecirc", 234);
add("&euml", 235);
add("&igrave", 236);
add("&iacute", 237);
add("&icirc", 238);
add("&iuml", 239);
add("&eth", 240);
add("&ntilde", 241);
add("&ograve", 242);
add("&oacute", 243);
add("&ocirc", 244);
add("&otilde", 245);
add("&ouml", 246);
add("&divide", 247);
add("&oslash", 248);
add("&ugrave", 249);
add("&uacute", 250);
add("&ucirc", 251);
add("&uuml", 252);
add("&yacute", 253);
add("&thorn", 254);
add("&yuml", 255);
add("&fnof", 402);
add("&Alpha", 913);
add("&Beta", 914);
add("&Gamma", 915);
add("&Delta", 916);
add("&Epsilon",917);
add("&Zeta", 918);
add("&Eta", 919);
add("&Theta", 920);
add("&Iota", 921);
add("&Kappa", 922);
add("&Lambda", 923);
add("&Mu", 924);
add("&Nu", 925);
add("&Xi", 926);
add("&Omicron",927);
add("&Pi", 928);
add("&Rho", 929);
add("&Sigma", 931);
add("&Tau", 932);
add("&Upsilon",933);
add("&Phi", 934);
add("&Chi", 935);
add("&Psi", 936);
add("&Omega", 937);
add("&alpha", 945);
add("&beta", 946);
add("&gamma", 947);
add("&delta", 948);
add("&epsilon",949);
add("&zeta", 950);
add("&eta", 951);
add("&theta", 952);
add("&iota", 953);
add("&kappa", 954);
add("&lambda", 955);
add("&mu", 956);
add("&nu", 957);
add("&xi", 958);
add("&omicron",959);
add("&pi", 960);
add("&rho", 961);
add("&sigmaf", 962);
add("&sigma", 963);
add("&tau", 964);
add("&upsilon",965);
add("&phi", 966);
add("&chi", 967);
add("&psi", 968);
add("&omega", 969);
add("&thetasym",977);
add("&upsih", 978);
add("&piv", 982);
add("&bull", 8226);
add("&hellip", 8230);
add("&prime", 8242);
add("&Prime", 8243);
add("&oline", 8254);
add("&frasl", 8260);
add("&weierp", 8472);
add("&image", 8465);
add("&real", 8476);
add("&trade", 8482);
add("&alefsym",8501);
add("&larr", 8592);
add("&uarr", 8593);
add("&rarr", 8594);
add("&darr", 8595);
add("&harr", 8596);
add("&crarr", 8629);
add("&lArr", 8656);
add("&uArr", 8657);
add("&rArr", 8658);
add("&dArr", 8659);
add("&hArr", 8660);
add("&forall", 8704);
add("&part", 8706);
add("&exist", 8707);
add("&empty", 8709);
add("&nabla", 8711);
add("&isin", 8712);
add("&notin", 8713);
add("&ni", 8715);
add("&prod", 8719);
add("&sum", 8721);
add("&minus", 8722);
add("&lowast", 8727);
add("&radic", 8730);
add("&prop", 8733);
add("&infin", 8734);
add("&ang", 8736);
add("&and", 8743);
add("&or", 8744);
add("&cap", 8745);
add("&cup", 8746);
add("&int", 8747);
add("&there4", 8756);
add("&sim", 8764);
add("&cong", 8773);
add("&asymp", 8776);
add("&ne", 8800);
add("&equiv", 8801);
add("&le", 8804);
add("&ge", 8805);
add("&sub", 8834);
add("&sup", 8835);
add("&nsub", 8836);
add("&sube", 8838);
add("&supe", 8839);
add("&oplus", 8853);
add("&otimes", 8855);
add("&perp", 8869);
add("&sdot", 8901);
add("&lceil", 8968);
add("&rceil", 8969);
add("&lfloor", 8970);
add("&rfloor", 8971);
add("&lang", 9001);
add("&rang", 9002);
add("&loz", 9674);
add("&spades", 9824);
add("&clubs", 9827);
add("&hearts", 9829);
add("&diams", 9830);
add("&quot", 34);
add("&amp", 38);
add("&lt", 60);
add("&gt", 62);
add("&OElig", 338);
add("&oelig", 339);
add("&Scaron", 352);
add("&scaron", 353);
add("&Yuml", 376);
add("&circ", 710);
add("&tilde", 732);
add("&ensp", 8194);
add("&emsp", 8195);
add("&thinsp", 8201);
add("&zwnj", 8204);
add("&zwj", 8205);
add("&lrm", 8206);
add("&rlm", 8207);
add("&ndash", 8211);
add("&mdash", 8212);
add("&lsquo", 8216);
add("&rsquo", 8217);
add("&sbquo", 8218);
add("&ldquo", 8220);
add("&rdquo", 8221);
add("&bdquo", 8222);
add("&dagger", 8224);
add("&Dagger", 8225);
add("&permil", 8240);
add("&lsaquo", 8249);
add("&rsaquo", 8250);
add("&euro", 8364);
}
}
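For context on the Entities class removed here (NekoHTML resolves HTML entities itself, so the hand-built table is no longer needed), a hedged sketch of what it did; the wrapper class is invented, and it sits in the same package because decode() is package-private:

package org.apache.lucene.benchmark.byTask.feeds.demohtml;

public class EntitiesSketch {
  public static void main(String[] args) {
    System.out.println(Entities.decode("&amp;"));            // "&"   (named entity)
    System.out.println(Entities.decode("&#228;"));           // "ä"   (decimal numeric entity)
    System.out.println(Entities.encode("K\u00E4se & Brot")); // "K&auml;se &amp; Brot"
  }
}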

View File

@ -1,123 +0,0 @@
// FastCharStream.java
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
import java.io.*;
/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
* this does not do line-number counting, but instead keeps track of the
* character position of the token in the input, as required by Lucene's {@link
* org.apache.lucene.analysis.Token} API.
* */
public final class FastCharStream implements CharStream {
char[] buffer = null;
int bufferLength = 0; // end of valid chars
int bufferPosition = 0; // next char to read
int tokenStart = 0; // offset in buffer
int bufferStart = 0; // position in file of buffer
Reader input; // source of chars
/** Constructs from a Reader. */
public FastCharStream(Reader r) {
input = r;
}
public final char readChar() throws IOException {
if (bufferPosition >= bufferLength)
refill();
return buffer[bufferPosition++];
}
private final void refill() throws IOException {
int newPosition = bufferLength - tokenStart;
if (tokenStart == 0) { // token won't fit in buffer
if (buffer == null) { // first time: alloc buffer
buffer = new char[2048];
} else if (bufferLength == buffer.length) { // grow buffer
char[] newBuffer = new char[buffer.length*2];
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
buffer = newBuffer;
}
} else { // shift token to front
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
}
bufferLength = newPosition; // update state
bufferPosition = newPosition;
bufferStart += tokenStart;
tokenStart = 0;
int charsRead = // fill space in buffer
input.read(buffer, newPosition, buffer.length-newPosition);
if (charsRead == -1)
throw new IOException("read past eof");
else
bufferLength += charsRead;
}
public final char BeginToken() throws IOException {
tokenStart = bufferPosition;
return readChar();
}
public final void backup(int amount) {
bufferPosition -= amount;
}
public final String GetImage() {
return new String(buffer, tokenStart, bufferPosition - tokenStart);
}
public final char[] GetSuffix(int len) {
char[] value = new char[len];
System.arraycopy(buffer, bufferPosition - len, value, 0, len);
return value;
}
public final void Done() {
try {
input.close();
} catch (IOException e) {
}
}
public final int getColumn() {
return bufferStart + bufferPosition;
}
public final int getLine() {
return 1;
}
public final int getEndColumn() {
return bufferStart + bufferPosition;
}
public final int getEndLine() {
return 1;
}
public final int getBeginColumn() {
return bufferStart + tokenStart;
}
public final int getBeginLine() {
return 1;
}
}

View File

@ -1,722 +0,0 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
import java.io.*;
import java.util.Locale;
import java.util.Properties;
/**
* Basic html parser (for demo/testing purposes only!)
*/
public class HTMLParser implements HTMLParserConstants {
public static int SUMMARY_LENGTH = 200;
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
Properties metaTags=new Properties();
String currentMetaTag=null;
String currentMetaContent=null;
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
Reader pipeIn = null;
Writer pipeOut;
private MyPipedInputStream pipeInStream = null;
private PipedOutputStream pipeOutStream = null;
public HTMLParser(Reader reader) {
this(new FastCharStream(reader));
}
private class MyPipedInputStream extends PipedInputStream{
public MyPipedInputStream(){
super();
}
public MyPipedInputStream(PipedOutputStream src) throws IOException{
super(src);
}
public boolean full() throws IOException{
return this.available() >= PipedInputStream.PIPE_SIZE;
}
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return title.toString().trim();
}
public Properties getMetaTags() throws IOException,
InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return metaTags;
}
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
break;
wait(10);
}
}
if (summary.length() > SUMMARY_LENGTH)
summary.setLength(SUMMARY_LENGTH);
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.equals(""))
return tit;
else
return sum;
}
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeInStream = new MyPipedInputStream();
pipeOutStream = new PipedOutputStream(pipeInStream);
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
return pipeIn;
}
void addToSummary(String text) {
if (summary.length() < SUMMARY_LENGTH) {
summary.append(text);
if (summary.length() >= SUMMARY_LENGTH) {
synchronized(this) {
notifyAll();
}
}
}
}
void addText(String text) throws IOException {
if (inStyle)
return;
if (inTitle)
title.append(text);
else {
addToSummary(text);
if (!titleComplete && !(title.length() == 0)) { // finished title
synchronized(this) {
titleComplete = true; // tell waiting threads
notifyAll();
}
}
}
length += text.length();
pipeOut.write(text);
afterSpace = false;
}
void addMetaTag() {
metaTags.setProperty(currentMetaTag, currentMetaContent);
currentMetaTag = null;
currentMetaContent = null;
return;
}
void addSpace() throws IOException {
if (!afterSpace) {
if (inTitle)
title.append(" ");
else
addToSummary(" ");
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
afterSpace = true;
}
}
final public void HTMLDocument() throws ParseException, IOException {
Token t;
label_1:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ScriptStart:
case TagName:
case DeclName:
case Comment1:
case Comment2:
case Word:
case Entity:
case Space:
case Punct:
;
break;
default:
jj_la1[0] = jj_gen;
break label_1;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case TagName:
Tag();
afterTag = true;
break;
case DeclName:
t = Decl();
afterTag = true;
break;
case Comment1:
case Comment2:
CommentTag();
afterTag = true;
break;
case ScriptStart:
ScriptTag();
afterTag = true;
break;
case Word:
t = jj_consume_token(Word);
addText(t.image); afterTag = false;
break;
case Entity:
t = jj_consume_token(Entity);
addText(Entities.decode(t.image)); afterTag = false;
break;
case Punct:
t = jj_consume_token(Punct);
addText(t.image); afterTag = false;
break;
case Space:
jj_consume_token(Space);
addSpace(); afterTag = false;
break;
default:
jj_la1[1] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(0);
}
final public void Tag() throws ParseException, IOException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
String tagName = t1.image.toLowerCase(Locale.ROOT);
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
label_2:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
;
break;
default:
jj_la1[2] = jj_gen;
break label_2;
}
t1 = jj_consume_token(ArgName);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgEquals:
jj_consume_token(ArgEquals);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgValue:
case ArgQuote1:
case ArgQuote2:
t2 = ArgValue();
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
if(inMetaTag &&
( t1.image.equalsIgnoreCase("name") ||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
)
&& t2 != null)
{
currentMetaTag=t2.image.toLowerCase(Locale.ROOT);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
currentMetaContent=t2.image.toLowerCase(Locale.ROOT);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
break;
default:
jj_la1[3] = jj_gen;
;
}
break;
default:
jj_la1[4] = jj_gen;
;
}
}
jj_consume_token(TagEnd);
}
final public Token ArgValue() throws ParseException {
Token t = null;
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgValue:
t = jj_consume_token(ArgValue);
{if (true) return t;}
break;
default:
jj_la1[5] = jj_gen;
if (jj_2_1(2)) {
jj_consume_token(ArgQuote1);
jj_consume_token(CloseQuote1);
{if (true) return t;}
} else {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgQuote1:
jj_consume_token(ArgQuote1);
t = jj_consume_token(Quote1Text);
jj_consume_token(CloseQuote1);
{if (true) return t;}
break;
default:
jj_la1[6] = jj_gen;
if (jj_2_2(2)) {
jj_consume_token(ArgQuote2);
jj_consume_token(CloseQuote2);
{if (true) return t;}
} else {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgQuote2:
jj_consume_token(ArgQuote2);
t = jj_consume_token(Quote2Text);
jj_consume_token(CloseQuote2);
{if (true) return t;}
break;
default:
jj_la1[7] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
}
}
}
throw new Error("Missing return statement in function");
}
final public Token Decl() throws ParseException {
Token t;
t = jj_consume_token(DeclName);
label_3:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
case ArgEquals:
case ArgValue:
case ArgQuote1:
case ArgQuote2:
;
break;
default:
jj_la1[8] = jj_gen;
break label_3;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
jj_consume_token(ArgName);
break;
case ArgValue:
case ArgQuote1:
case ArgQuote2:
ArgValue();
break;
case ArgEquals:
jj_consume_token(ArgEquals);
break;
default:
jj_la1[9] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(TagEnd);
{if (true) return t;}
throw new Error("Missing return statement in function");
}
final public void CommentTag() throws ParseException {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case Comment1:
jj_consume_token(Comment1);
label_4:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case CommentText1:
;
break;
default:
jj_la1[10] = jj_gen;
break label_4;
}
jj_consume_token(CommentText1);
}
jj_consume_token(CommentEnd1);
break;
case Comment2:
jj_consume_token(Comment2);
label_5:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case CommentText2:
;
break;
default:
jj_la1[11] = jj_gen;
break label_5;
}
jj_consume_token(CommentText2);
}
jj_consume_token(CommentEnd2);
break;
default:
jj_la1[12] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
final public void ScriptTag() throws ParseException {
jj_consume_token(ScriptStart);
label_6:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ScriptText:
;
break;
default:
jj_la1[13] = jj_gen;
break label_6;
}
jj_consume_token(ScriptText);
}
jj_consume_token(ScriptEnd);
}
private boolean jj_2_1(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_1(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(0, xla); }
}
private boolean jj_2_2(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_2(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(1, xla); }
}
private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
/** Generated Token Manager. */
public HTMLParserTokenManager token_source;
/** Current token. */
public Token token;
/** Next token. */
public Token jj_nt;
private int jj_ntk;
private Token jj_scanpos, jj_lastpos;
private int jj_la;
private int jj_gen;
final private int[] jj_la1 = new int[14];
static private int[] jj_la1_0;
static {
jj_la1_init_0();
}
private static void jj_la1_init_0() {
jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
}
final private JJCalls[] jj_2_rtns = new JJCalls[2];
private boolean jj_rescan = false;
private int jj_gc = 0;
/** Constructor with user supplied CharStream. */
public HTMLParser(CharStream stream) {
token_source = new HTMLParserTokenManager(stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Reinitialise. */
public void ReInit(CharStream stream) {
token_source.ReInit(stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Constructor with generated Token Manager. */
public HTMLParser(HTMLParserTokenManager tm) {
token_source = tm;
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
/** Reinitialise. */
public void ReInit(HTMLParserTokenManager tm) {
token_source = tm;
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 14; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
private Token jj_consume_token(int kind) throws ParseException {
Token oldToken;
if ((oldToken = token).next != null) token = token.next;
else token = token.next = token_source.getNextToken();
jj_ntk = -1;
if (token.kind == kind) {
jj_gen++;
if (++jj_gc > 100) {
jj_gc = 0;
for (int i = 0; i < jj_2_rtns.length; i++) {
JJCalls c = jj_2_rtns[i];
while (c != null) {
if (c.gen < jj_gen) c.first = null;
c = c.next;
}
}
}
return token;
}
token = oldToken;
jj_kind = kind;
throw generateParseException();
}
static private final class LookaheadSuccess extends java.lang.Error { }
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
private boolean jj_scan_token(int kind) {
if (jj_scanpos == jj_lastpos) {
jj_la--;
if (jj_scanpos.next == null) {
jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
} else {
jj_lastpos = jj_scanpos = jj_scanpos.next;
}
} else {
jj_scanpos = jj_scanpos.next;
}
if (jj_rescan) {
int i = 0; Token tok = token;
while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
if (tok != null) jj_add_error_token(kind, i);
}
if (jj_scanpos.kind != kind) return true;
if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
return false;
}
/** Get the next Token. */
final public Token getNextToken() {
if (token.next != null) token = token.next;
else token = token.next = token_source.getNextToken();
jj_ntk = -1;
jj_gen++;
return token;
}
/** Get the specific Token. */
final public Token getToken(int index) {
Token t = token;
for (int i = 0; i < index; i++) {
if (t.next != null) t = t.next;
else t = t.next = token_source.getNextToken();
}
return t;
}
private int jj_ntk() {
if ((jj_nt=token.next) == null)
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
else
return (jj_ntk = jj_nt.kind);
}
private java.util.List jj_expentries = new java.util.ArrayList();
private int[] jj_expentry;
private int jj_kind = -1;
private int[] jj_lasttokens = new int[100];
private int jj_endpos;
private void jj_add_error_token(int kind, int pos) {
if (pos >= 100) return;
if (pos == jj_endpos + 1) {
jj_lasttokens[jj_endpos++] = kind;
} else if (jj_endpos != 0) {
jj_expentry = new int[jj_endpos];
for (int i = 0; i < jj_endpos; i++) {
jj_expentry[i] = jj_lasttokens[i];
}
jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
int[] oldentry = (int[])(it.next());
if (oldentry.length == jj_expentry.length) {
for (int i = 0; i < jj_expentry.length; i++) {
if (oldentry[i] != jj_expentry[i]) {
continue jj_entries_loop;
}
}
jj_expentries.add(jj_expentry);
break jj_entries_loop;
}
}
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
}
}
/** Generate ParseException. */
public ParseException generateParseException() {
jj_expentries.clear();
boolean[] la1tokens = new boolean[31];
if (jj_kind >= 0) {
la1tokens[jj_kind] = true;
jj_kind = -1;
}
for (int i = 0; i < 14; i++) {
if (jj_la1[i] == jj_gen) {
for (int j = 0; j < 32; j++) {
if ((jj_la1_0[i] & (1<<j)) != 0) {
la1tokens[j] = true;
}
}
}
}
for (int i = 0; i < 31; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;
jj_expentries.add(jj_expentry);
}
}
jj_endpos = 0;
jj_rescan_token();
jj_add_error_token(0, 0);
int[][] exptokseq = new int[jj_expentries.size()][];
for (int i = 0; i < jj_expentries.size(); i++) {
exptokseq[i] = (int[])jj_expentries.get(i);
}
return new ParseException(token, exptokseq, tokenImage);
}
/** Enable tracing. */
final public void enable_tracing() {
}
/** Disable tracing. */
final public void disable_tracing() {
}
private void jj_rescan_token() {
jj_rescan = true;
for (int i = 0; i < 2; i++) {
try {
JJCalls p = jj_2_rtns[i];
do {
if (p.gen > jj_gen) {
jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
switch (i) {
case 0: jj_3_1(); break;
case 1: jj_3_2(); break;
}
}
p = p.next;
} while (p != null);
} catch(LookaheadSuccess ls) { }
}
jj_rescan = false;
}
private void jj_save(int index, int xla) {
JJCalls p = jj_2_rtns[index];
while (p.gen > jj_gen) {
if (p.next == null) { p = p.next = new JJCalls(); break; }
p = p.next;
}
p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
}
static final class JJCalls {
int gen;
Token first;
int arg;
JJCalls next;
}
// void handleException(Exception e) {
// System.out.println(e.toString()); // print the error message
// System.out.println("Skipping...");
// Token t;
// do {
// t = getNextToken();
// } while (t.kind != TagEnd);
// }
}

View File

@ -1,394 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// HTMLParser.jj
options {
STATIC = false;
//DEBUG_LOOKAHEAD = true;
//DEBUG_TOKEN_MANAGER = true;
UNICODE_INPUT = true;
USER_CHAR_STREAM=true;
}
PARSER_BEGIN(HTMLParser)
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
import java.io.*;
import java.util.Locale;
import java.util.Properties;
/**
* Basic html parser (for demo/testing purposes only!)
*/
public class HTMLParser {
public static int SUMMARY_LENGTH = 200;
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
Properties metaTags=new Properties();
String currentMetaTag=null;
String currentMetaContent=null;
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
Reader pipeIn = null;
Writer pipeOut;
private MyPipedInputStream pipeInStream = null;
private PipedOutputStream pipeOutStream = null;
public HTMLParser(Reader reader) {
this(new FastCharStream(reader));
}
private class MyPipedInputStream extends PipedInputStream{
public MyPipedInputStream(){
super();
}
public MyPipedInputStream(PipedOutputStream src) throws IOException{
super(src);
}
public boolean full() throws IOException{
return this.available() >= PipedInputStream.PIPE_SIZE;
}
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return title.toString().trim();
}
public Properties getMetaTags() throws IOException,
InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || pipeInStream.full())
break;
wait(10);
}
}
return metaTags;
}
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
break;
wait(10);
}
}
if (summary.length() > SUMMARY_LENGTH)
summary.setLength(SUMMARY_LENGTH);
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.equals(""))
return tit;
else
return sum;
}
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeInStream = new MyPipedInputStream();
pipeOutStream = new PipedOutputStream(pipeInStream);
pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
return pipeIn;
}
void addToSummary(String text) {
if (summary.length() < SUMMARY_LENGTH) {
summary.append(text);
if (summary.length() >= SUMMARY_LENGTH) {
synchronized(this) {
notifyAll();
}
}
}
}
void addText(String text) throws IOException {
if (inStyle)
return;
if (inTitle)
title.append(text);
else {
addToSummary(text);
if (!titleComplete && !(title.length() == 0)) { // finished title
synchronized(this) {
titleComplete = true; // tell waiting threads
notifyAll();
}
}
}
length += text.length();
pipeOut.write(text);
afterSpace = false;
}
void addMetaTag() {
metaTags.setProperty(currentMetaTag, currentMetaContent);
currentMetaTag = null;
currentMetaContent = null;
return;
}
void addSpace() throws IOException {
if (!afterSpace) {
if (inTitle)
title.append(" ");
else
addToSummary(" ");
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
afterSpace = true;
}
}
// void handleException(Exception e) {
// System.out.println(e.toString()); // print the error message
// System.out.println("Skipping...");
// Token t;
// do {
// t = getNextToken();
// } while (t.kind != TagEnd);
// }
}
PARSER_END(HTMLParser)
void HTMLDocument() throws IOException :
{
Token t;
}
{
// try {
( Tag() { afterTag = true; }
| t=Decl() { afterTag = true; }
| CommentTag() { afterTag = true; }
| ScriptTag() { afterTag = true; }
| t=<Word> { addText(t.image); afterTag = false; }
| t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
| t=<Punct> { addText(t.image); afterTag = false; }
| <Space> { addSpace(); afterTag = false; }
)* <EOF>
// } catch (ParseException e) {
// handleException(e);
// }
}
void Tag() throws IOException :
{
Token t1, t2;
boolean inImg = false;
}
{
t1=<TagName> {
String tagName = t1.image.toLowerCase(Locale.ROOT);
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
}
(t1=<ArgName>
(<ArgEquals>
(t2=ArgValue() // save ALT text in IMG tag
{
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
if(inMetaTag &&
( t1.image.equalsIgnoreCase("name") ||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
)
&& t2 != null)
{
currentMetaTag=t2.image.toLowerCase(Locale.ROOT);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
currentMetaContent=t2.image.toLowerCase(Locale.ROOT);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
}
)?
)?
)*
<TagEnd>
}
Token ArgValue() :
{
Token t = null;
}
{
t=<ArgValue> { return t; }
| LOOKAHEAD(2)
<ArgQuote1> <CloseQuote1> { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
| LOOKAHEAD(2)
<ArgQuote2> <CloseQuote2> { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
}
Token Decl() :
{
Token t;
}
{
t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
{ return t; }
}
void CommentTag() :
{}
{
(<Comment1> ( <CommentText1> )* <CommentEnd1>)
|
(<Comment2> ( <CommentText2> )* <CommentEnd2>)
}
void ScriptTag() :
{}
{
<ScriptStart> ( <ScriptText> )* <ScriptEnd>
}
TOKEN :
{
< ScriptStart: "<script" > : WithinScript
| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < Comment1: "<!--" > : WithinComment1
| < Comment2: "<!" > : WithinComment2
| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
<LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM: ["0"-"9"] >
| < #HEX: ["0"-"9","A"-"F","a"-"f"] >
| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
| < Space: (<SP>)+ >
| < #SP: [" ","\t","\r","\n"] >
| < Punct: ~[] > // Keep this last. It is a catch-all.
}
<WithinScript> TOKEN:
{
< ScriptText: (~["<",">"])+ | "<" | ">" >
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
}
<WithinTag> TOKEN:
{
< ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" > : AfterEquals
| < TagEnd: ">" | "=>" > : DEFAULT
}
<AfterEquals> TOKEN:
{
< ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
(~[" ","\t","\r","\n",">"])* > : WithinTag
}
<WithinTag, AfterEquals> TOKEN:
{
< ArgQuote1: "'" > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}
<WithinTag, AfterEquals> SKIP:
{
< <Space> >
}
<WithinQuote1> TOKEN:
{
< Quote1Text: (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}
<WithinQuote2> TOKEN:
{
< Quote2Text: (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}
<WithinComment1> TOKEN :
{
< CommentText1: (~["-"])+ | "-" >
| < CommentEnd1: "-->" > : DEFAULT
}
<WithinComment2> TOKEN :
{
< CommentText2: (~[">"])+ >
| < CommentEnd2: ">" > : DEFAULT
}

View File

@ -1,124 +0,0 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParserConstants.java */
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/**
* Token literal values and constants.
* Generated by org.javacc.parser.OtherFilesGen#start()
*/
public interface HTMLParserConstants {
/** End of File. */
int EOF = 0;
/** RegularExpression Id. */
int ScriptStart = 1;
/** RegularExpression Id. */
int TagName = 2;
/** RegularExpression Id. */
int DeclName = 3;
/** RegularExpression Id. */
int Comment1 = 4;
/** RegularExpression Id. */
int Comment2 = 5;
/** RegularExpression Id. */
int Word = 6;
/** RegularExpression Id. */
int LET = 7;
/** RegularExpression Id. */
int NUM = 8;
/** RegularExpression Id. */
int HEX = 9;
/** RegularExpression Id. */
int Entity = 10;
/** RegularExpression Id. */
int Space = 11;
/** RegularExpression Id. */
int SP = 12;
/** RegularExpression Id. */
int Punct = 13;
/** RegularExpression Id. */
int ScriptText = 14;
/** RegularExpression Id. */
int ScriptEnd = 15;
/** RegularExpression Id. */
int ArgName = 16;
/** RegularExpression Id. */
int ArgEquals = 17;
/** RegularExpression Id. */
int TagEnd = 18;
/** RegularExpression Id. */
int ArgValue = 19;
/** RegularExpression Id. */
int ArgQuote1 = 20;
/** RegularExpression Id. */
int ArgQuote2 = 21;
/** RegularExpression Id. */
int Quote1Text = 23;
/** RegularExpression Id. */
int CloseQuote1 = 24;
/** RegularExpression Id. */
int Quote2Text = 25;
/** RegularExpression Id. */
int CloseQuote2 = 26;
/** RegularExpression Id. */
int CommentText1 = 27;
/** RegularExpression Id. */
int CommentEnd1 = 28;
/** RegularExpression Id. */
int CommentText2 = 29;
/** RegularExpression Id. */
int CommentEnd2 = 30;
/** Lexical state. */
int DEFAULT = 0;
/** Lexical state. */
int WithinScript = 1;
/** Lexical state. */
int WithinTag = 2;
/** Lexical state. */
int AfterEquals = 3;
/** Lexical state. */
int WithinQuote1 = 4;
/** Lexical state. */
int WithinQuote2 = 5;
/** Lexical state. */
int WithinComment1 = 6;
/** Lexical state. */
int WithinComment2 = 7;
/** Literal token values. */
String[] tokenImage = {
"<EOF>",
"\"<script\"",
"<TagName>",
"<DeclName>",
"\"<!--\"",
"\"<!\"",
"<Word>",
"<LET>",
"<NUM>",
"<HEX>",
"<Entity>",
"<Space>",
"<SP>",
"<Punct>",
"<ScriptText>",
"<ScriptEnd>",
"<ArgName>",
"\"=\"",
"<TagEnd>",
"<ArgValue>",
"\"\\\'\"",
"\"\\\"\"",
"<token of kind 22>",
"<Quote1Text>",
"<CloseQuote1>",
"<Quote2Text>",
"<CloseQuote2>",
"<CommentText1>",
"\"-->\"",
"<CommentText2>",
"\">\"",
};
}
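As a usage sketch only (it relies on the HTMLParser(Reader) constructor shown in the tests below and on the getNextToken() method JavaCC generates for every parser, nothing specific to this grammar), these constants are consumed by comparing Token.kind against them while walking the raw token stream:

// Hedged sketch: walk the raw token stream of the removed JavaCC parser and
// filter by kind using the constants above.
import java.io.StringReader;

public class TokenDump {
  public static void main(String[] args) {
    HTMLParser parser = new HTMLParser(new StringReader("<b>hello</b> world"));
    Token t;
    while ((t = parser.getNextToken()).kind != HTMLParserConstants.EOF) {
      if (t.kind == HTMLParserConstants.Word) {
        System.out.println("word: " + t.image);   // hello, world
      }
    }
  }
}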

View File

@ -1,198 +0,0 @@
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
/* JavaCCOptions:KEEP_LINE_COL=null */
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/**
* This exception is thrown when parse errors are encountered.
* You can explicitly create objects of this exception type by
* calling the method generateParseException in the generated
* parser.
*
* You can modify this class to customize your error reporting
* mechanisms so long as you retain the public fields.
*/
public class ParseException extends Exception {
/**
* This constructor is used by the method "generateParseException"
* in the generated parser. Calling this constructor generates
* a new object of this type with the fields "currentToken",
* "expectedTokenSequences", and "tokenImage" set. The boolean
* flag "specialConstructor" is also set to true to indicate that
* this constructor was used to create this object.
* This constructor calls its super class with the empty string
* to force the "toString" method of parent class "Throwable" to
* print the error message in the form:
* ParseException: <result of getMessage>
*/
public ParseException(Token currentTokenVal,
int[][] expectedTokenSequencesVal,
String[] tokenImageVal
)
{
super("");
specialConstructor = true;
currentToken = currentTokenVal;
expectedTokenSequences = expectedTokenSequencesVal;
tokenImage = tokenImageVal;
}
/**
* The following constructors are for use by you for whatever
* purpose you can think of. Constructing the exception in this
* manner makes the exception behave in the normal way - i.e., as
* documented in the class "Throwable". The fields "errorToken",
* "expectedTokenSequences", and "tokenImage" do not contain
* relevant information. The JavaCC generated code does not use
* these constructors.
*/
public ParseException() {
super();
specialConstructor = false;
}
/** Constructor with message. */
public ParseException(String message) {
super(message);
specialConstructor = false;
}
/**
* This variable determines which constructor was used to create
* this object and thereby affects the semantics of the
* "getMessage" method (see below).
*/
protected boolean specialConstructor;
/**
* This is the last token that has been consumed successfully. If
* this object has been created due to a parse error, the token
* following this token will (therefore) be the first error token.
*/
public Token currentToken;
/**
* Each entry in this array is an array of integers. Each array
* of integers represents a sequence of tokens (by their ordinal
* values) that is expected at this point of the parse.
*/
public int[][] expectedTokenSequences;
/**
* This is a reference to the "tokenImage" array of the generated
* parser within which the parse error occurred. This array is
* defined in the generated ...Constants interface.
*/
public String[] tokenImage;
/**
* This method has the standard behavior when this object has been
* created using the standard constructors. Otherwise, it uses
* "currentToken" and "expectedTokenSequences" to generate a parse
* error message and returns it. If this object has been created
* due to a parse error, and you do not catch it (it gets thrown
* from the parser), then this method is called during the printing
* of the final stack trace, and hence the correct error message
* gets displayed.
*/
public String getMessage() {
if (!specialConstructor) {
return super.getMessage();
}
StringBuffer expected = new StringBuffer();
int maxSize = 0;
for (int i = 0; i < expectedTokenSequences.length; i++) {
if (maxSize < expectedTokenSequences[i].length) {
maxSize = expectedTokenSequences[i].length;
}
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
}
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
expected.append("...");
}
expected.append(eol).append(" ");
}
String retval = "Encountered \"";
Token tok = currentToken.next;
for (int i = 0; i < maxSize; i++) {
if (i != 0) retval += " ";
if (tok.kind == 0) {
retval += tokenImage[0];
break;
}
retval += " " + tokenImage[tok.kind];
retval += " \"";
retval += add_escapes(tok.image);
retval += " \"";
tok = tok.next;
}
retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
retval += "." + eol;
if (expectedTokenSequences.length == 1) {
retval += "Was expecting:" + eol + " ";
} else {
retval += "Was expecting one of:" + eol + " ";
}
retval += expected.toString();
return retval;
}
/**
* The end of line string for this machine.
*/
protected String eol = System.getProperty("line.separator", "\n");
/**
* Used to convert raw characters to their escaped version
* when these raw versions cannot be used as part of an ASCII
* string literal.
*/
protected String add_escapes(String str) {
StringBuffer retval = new StringBuffer();
char ch;
for (int i = 0; i < str.length(); i++) {
switch (str.charAt(i))
{
case 0 :
continue;
case '\b':
retval.append("\\b");
continue;
case '\t':
retval.append("\\t");
continue;
case '\n':
retval.append("\\n");
continue;
case '\f':
retval.append("\\f");
continue;
case '\r':
retval.append("\\r");
continue;
case '\"':
retval.append("\\\"");
continue;
case '\'':
retval.append("\\\'");
continue;
case '\\':
retval.append("\\\\");
continue;
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}
continue;
}
}
return retval.toString();
}
}
/* JavaCC - OriginalChecksum=e449d0e43f3d85deb1260a88b7e90fcd (do not edit this line) */

View File

@ -1,50 +0,0 @@
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
class ParserThread extends Thread {
HTMLParser parser;
ParserThread(HTMLParser p) {
parser = p;
}
@Override
public void run() { // convert pipeOut to pipeIn
try {
try { // parse document to pipeOut
parser.HTMLDocument();
} catch (ParseException e) {
System.out.println("Parse Aborted: " + e.getMessage());
} catch (TokenMgrError e) {
System.out.println("Parse Aborted: " + e.getMessage());
} finally {
parser.pipeOut.close();
synchronized (parser) {
parser.summary.setLength(HTMLParser.SUMMARY_LENGTH);
parser.titleComplete = true;
parser.notifyAll();
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
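The thread above exists because the old parser streamed its extracted text through a pipe: parsing ran on a background thread writing into pipeOut while the caller read from the connected end. A minimal, self-contained sketch of that producer/consumer pattern, using plain java.io pipes rather than the original HTMLParser fields:

// Standalone sketch of the pipe-based producer/consumer pattern ParserThread
// implements; everything here is standard java.io, not the original code.
import java.io.IOException;
import java.io.PipedReader;
import java.io.PipedWriter;

public class PipeSketch {
  public static void main(String[] args) throws IOException, InterruptedException {
    PipedWriter pipeOut = new PipedWriter();
    PipedReader pipeIn = new PipedReader(pipeOut);

    Thread producer = new Thread(() -> {
      try (PipedWriter out = pipeOut) {
        out.write("extracted body text");   // stands in for the parser's output
      } catch (IOException e) {
        e.printStackTrace();
      }
    });
    producer.start();

    StringBuilder sb = new StringBuilder();
    int ch;
    while ((ch = pipeIn.read()) != -1) {
      sb.append((char) ch);
    }
    producer.join();
    System.out.println(sb);                  // prints: extracted body text
  }
}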

View File

@ -1,67 +0,0 @@
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashSet;
import java.util.Set;
/**
* Utility class storing set of commonly-used html tags.
*/
public final class Tags {
/**
* Contains all tags for which whitespace has to be inserted for proper tokenization
*/
public static final Set<String> WS_ELEMS;
static{
WS_ELEMS = new HashSet<String>();
WS_ELEMS.add("<hr");
WS_ELEMS.add("<hr/"); // note that "<hr />" does not need to be listed explicitly
WS_ELEMS.add("<br");
WS_ELEMS.add("<br/");
WS_ELEMS.add("<p");
WS_ELEMS.add("</p");
WS_ELEMS.add("<div");
WS_ELEMS.add("</div");
WS_ELEMS.add("<td");
WS_ELEMS.add("</td");
WS_ELEMS.add("<li");
WS_ELEMS.add("</li");
WS_ELEMS.add("<q");
WS_ELEMS.add("</q");
WS_ELEMS.add("<blockquote");
WS_ELEMS.add("</blockquote");
WS_ELEMS.add("<dt");
WS_ELEMS.add("</dt");
WS_ELEMS.add("<h1");
WS_ELEMS.add("</h1");
WS_ELEMS.add("<h2");
WS_ELEMS.add("</h2");
WS_ELEMS.add("<h3");
WS_ELEMS.add("</h3");
WS_ELEMS.add("<h4");
WS_ELEMS.add("</h4");
WS_ELEMS.add("<h5");
WS_ELEMS.add("</h5");
WS_ELEMS.add("<h6");
WS_ELEMS.add("</h6");
}
}
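For context on how a set like this is consumed: before appending the text that follows a tag, the tokenizer checks the tag prefix against WS_ELEMS and inserts a space so adjacent cells, list items, and headings do not glue words together. A hedged sketch; handleTag/addText/result are placeholder helpers, not methods from this module:

// Hypothetical consumer of Tags.WS_ELEMS; helper names are illustrative only.
import java.util.Locale;

class WhitespaceInserter {
  private final StringBuilder text = new StringBuilder();

  void handleTag(String rawTagStart) {       // e.g. "<TD" or "</p"
    if (Tags.WS_ELEMS.contains(rawTagStart.toLowerCase(Locale.ROOT))) {
      text.append(' ');                      // block boundary becomes a word boundary
    }
  }

  void addText(String s) {
    text.append(s);
  }

  String result() {
    return text.toString();
  }
}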

View File

@ -1,124 +0,0 @@
/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
/* JavaCCOptions:TOKEN_EXTENDS=,KEEP_LINE_COL=null */
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/**
* Describes the input token stream.
*/
public class Token {
/**
* An integer that describes the kind of this token. This numbering
* system is determined by JavaCCParser, and a table of these numbers is
* stored in the file ...Constants.java.
*/
public int kind;
/** The line number of the first character of this Token. */
public int beginLine;
/** The column number of the first character of this Token. */
public int beginColumn;
/** The line number of the last character of this Token. */
public int endLine;
/** The column number of the last character of this Token. */
public int endColumn;
/**
* The string image of the token.
*/
public String image;
/**
* A reference to the next regular (non-special) token from the input
* stream. If this is the last token from the input stream, or if the
* token manager has not read tokens beyond this one, this field is
* set to null. This is true only if this token is also a regular
* token. Otherwise, see below for a description of the contents of
* this field.
*/
public Token next;
/**
* This field is used to access special tokens that occur prior to this
* token, but after the immediately preceding regular (non-special) token.
* If there are no such special tokens, this field is set to null.
* When there is more than one such special token, this field refers
* to the last of these special tokens, which in turn refers to the next
* previous special token through its specialToken field, and so on
* until the first special token (whose specialToken field is null).
* The next fields of special tokens refer to other special tokens that
* immediately follow it (without an intervening regular token). If there
* is no such token, this field is null.
*/
public Token specialToken;
/**
* An optional attribute value of the Token.
* Tokens which are not used as syntactic sugar will often contain
* meaningful values that will be used later on by the compiler or
* interpreter. This attribute value is often different from the image.
* Any subclass of Token that actually wants to return a non-null value can
* override this method as appropriate.
*/
public Object getValue() {
return null;
}
/**
* No-argument constructor
*/
public Token() {}
/**
* Constructs a new token of the specified kind.
*/
public Token(int kind)
{
this(kind, null);
}
/**
* Constructs a new token for the specified Image and Kind.
*/
public Token(int kind, String image)
{
this.kind = kind;
this.image = image;
}
/**
* Returns the image.
*/
public String toString()
{
return image;
}
/**
* Returns a new Token object, by default. However, if you want, you
* can create and return subclass objects based on the value of ofKind.
* Simply add the cases to the switch for all those special cases.
* For example, if you have a subclass of Token called IDToken that
* you want to create if ofKind is ID, simply add something like :
*
* case MyParserConstants.ID : return new IDToken(ofKind, image);
*
* to the following switch statement. Then you can cast matchedToken
* variable to the appropriate type and use it in your lexical actions.
*/
public static Token newToken(int ofKind, String image)
{
switch(ofKind)
{
default : return new Token(ofKind, image);
}
}
public static Token newToken(int ofKind)
{
return newToken(ofKind, null);
}
}
/* JavaCC - OriginalChecksum=24643dc85fd6daeec42ceba20b46ee61 (do not edit this line) */
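The newToken() factory above is the extension hook the preceding comment describes. A purely hypothetical subclass, just to make the pattern concrete (nothing in this grammar defines an ID token kind):

// Hypothetical Token subclass for the newToken() hook; the ID kind referenced
// in the comment below is illustrative and not part of HTMLParserConstants.
class IDToken extends Token {
  private final String normalized;

  IDToken(int kind, String image) {
    super(kind, image);
    this.normalized = image == null ? null : image.trim();
  }

  @Override
  public Object getValue() {
    return normalized;
  }
}

// and inside Token.newToken(int ofKind, String image):
//   case MyParserConstants.ID : return new IDToken(ofKind, image);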

View File

@ -1,141 +0,0 @@
/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 4.1 */
/* JavaCCOptions: */
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/** Token Manager Error. */
@SuppressWarnings("serial")
public class TokenMgrError extends Error
{
/*
* Ordinals for various reasons why an Error of this type can be thrown.
*/
/**
* Lexical error occurred.
*/
static final int LEXICAL_ERROR = 0;
/**
* An attempt was made to create a second instance of a static token manager.
*/
static final int STATIC_LEXER_ERROR = 1;
/**
* Tried to change to an invalid lexical state.
*/
static final int INVALID_LEXICAL_STATE = 2;
/**
* Detected (and bailed out of) an infinite loop in the token manager.
*/
static final int LOOP_DETECTED = 3;
/**
* Indicates the reason why the exception is thrown. It will have
* one of the above 4 values.
*/
int errorCode;
/**
* Replaces unprintable characters by their escaped (or unicode escaped)
* equivalents in the given string
*/
protected static final String addEscapes(String str) {
StringBuffer retval = new StringBuffer();
char ch;
for (int i = 0; i < str.length(); i++) {
switch (str.charAt(i))
{
case 0 :
continue;
case '\b':
retval.append("\\b");
continue;
case '\t':
retval.append("\\t");
continue;
case '\n':
retval.append("\\n");
continue;
case '\f':
retval.append("\\f");
continue;
case '\r':
retval.append("\\r");
continue;
case '\"':
retval.append("\\\"");
continue;
case '\'':
retval.append("\\\'");
continue;
case '\\':
retval.append("\\\\");
continue;
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}
continue;
}
}
return retval.toString();
}
/**
* Returns a detailed message for the Error when it is thrown by the
* token manager to indicate a lexical error.
* Parameters :
* EOFSeen : indicates if EOF caused the lexical error
* curLexState : lexical state in which this error occurred
* errorLine : line number when the error occurred
* errorColumn : column number when the error occurred
* errorAfter : prefix that was seen before this error occurred
* curchar : the offending character
* Note: You can customize the lexical error message by modifying this method.
*/
protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
return("Lexical error at line " +
errorLine + ", column " +
errorColumn + ". Encountered: " +
(EOFSeen ? "<EOF> " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") +
"after : \"" + addEscapes(errorAfter) + "\"");
}
/**
* You can also modify the body of this method to customize your error messages.
* For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
* of end-users' concern, so you can return something like:
*
* "Internal Error : Please file a bug report .... "
*
* from this method for such cases in the release version of your parser.
*/
public String getMessage() {
return super.getMessage();
}
/*
* Constructors of various flavors follow.
*/
/** No arg constructor. */
public TokenMgrError() {
}
/** Constructor with message and reason. */
public TokenMgrError(String message, int reason) {
super(message);
errorCode = reason;
}
/** Full Constructor. */
public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
}
}
/* JavaCC - OriginalChecksum=538f0da130356fcc0bc7db621ab0389d (do not edit this line) */

View File

@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Example html parser based on JavaCC
</body>
</html>

View File

@ -1,181 +0,0 @@
package org.apache.lucene.benchmark.byTask.utils;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
/**
* Implements a {@link Reader} over a {@link StringBuilder} instance. Although
* one can use {@link java.io.StringReader} by passing it
* {@link StringBuilder#toString()}, it is better to use this class, as it
* doesn't mark the passed-in {@link StringBuilder} as shared (which will cause
* inner char[] allocations at the next append() attempt).<br>
* Notes:
* <ul>
* <li>This implementation assumes the underlying {@link StringBuilder} is not
* changed during the use of this {@link Reader} implementation.
* <li>This implementation is thread-safe.
* <li>The implementation looks very much like {@link java.io.StringReader} (for
* the right reasons).
* <li>If one wants to reuse that instance, then the following needs to be done:
* <pre>
* StringBuilder sb = new StringBuilder("some text");
* Reader reader = new StringBuilderReader(sb);
* ... read from reader - don't close it ! ...
* sb.setLength(0);
* sb.append("some new text");
* reader.reset();
* ... read the new string from the reader ...
* </pre>
* </ul>
*/
public class StringBuilderReader extends Reader {
// The StringBuilder to read from.
private StringBuilder sb;
// The length of 'sb'.
private int length;
// The next position to read from the StringBuilder.
private int next = 0;
// The mark position. The default value 0 means the start of the text.
private int mark = 0;
public StringBuilderReader(StringBuilder sb) {
set(sb);
}
/** Check to make sure that the stream has not been closed. */
private void ensureOpen() throws IOException {
if (sb == null) {
throw new IOException("Stream has already been closed");
}
}
@Override
public void close() {
synchronized (lock) {
sb = null;
}
}
/**
* Mark the present position in the stream. Subsequent calls to reset() will
* reposition the stream to this point.
*
* @param readAheadLimit Limit on the number of characters that may be read
* while still preserving the mark. Because the stream's input comes
* from a StringBuilder, there is no actual limit, so this argument
* must not be negative, but is otherwise ignored.
* @exception IllegalArgumentException If readAheadLimit is < 0
* @exception IOException If an I/O error occurs
*/
@Override
public void mark(int readAheadLimit) throws IOException {
if (readAheadLimit < 0){
throw new IllegalArgumentException("Read-ahead limit cannot be negative: " + readAheadLimit);
}
synchronized (lock) {
ensureOpen();
mark = next;
}
}
@Override
public boolean markSupported() {
return true;
}
@Override
public int read() throws IOException {
synchronized (lock) {
ensureOpen();
return next >= length ? -1 : sb.charAt(next++);
}
}
@Override
public int read(char cbuf[], int off, int len) throws IOException {
synchronized (lock) {
ensureOpen();
// Validate parameters
if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) {
throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length);
}
if (len == 0) {
return 0;
}
if (next >= length) {
return -1;
}
int n = Math.min(length - next, len);
sb.getChars(next, next + n, cbuf, off);
next += n;
return n;
}
}
@Override
public boolean ready() throws IOException {
synchronized (lock) {
ensureOpen();
return true;
}
}
@Override
public void reset() throws IOException {
synchronized (lock) {
ensureOpen();
next = mark;
length = sb.length();
}
}
public void set(StringBuilder sb) {
synchronized (lock) {
this.sb = sb;
length = sb.length();
next = mark = 0;
}
}
@Override
public long skip(long ns) throws IOException {
synchronized (lock) {
ensureOpen();
if (next >= length) {
return 0;
}
// Bound skip by beginning and end of the source
long n = Math.min(length - next, ns);
n = Math.max(-next, n);
next += n;
return n;
}
}
}
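A runnable version of the reuse pattern from the class javadoc above, using only methods defined in this class:

// Demonstrates the documented reuse pattern: drain the reader, change the
// StringBuilder's contents, then reset() to read the new text without
// reallocating the underlying char[].
import java.io.IOException;
import java.io.Reader;

public class StringBuilderReaderDemo {
  public static void main(String[] args) throws IOException {
    StringBuilder sb = new StringBuilder("some text");
    StringBuilderReader reader = new StringBuilderReader(sb);
    System.out.println(drain(reader));   // some text

    sb.setLength(0);
    sb.append("some new text");
    reader.reset();                      // rewinds to the mark and re-reads the length
    System.out.println(drain(reader));   // some new text
  }

  static String drain(Reader r) throws IOException {
    StringBuilder out = new StringBuilder();
    int ch;
    while ((ch = r.read()) != -1) {
      out.append((char) ch);
    }
    return out.toString();
  }
}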

View File

@ -1,4 +1,4 @@
package org.apache.lucene.benchmark.byTask.feeds.demohtml;
package org.apache.lucene.benchmark.byTask.feeds;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,46 +17,46 @@ package org.apache.lucene.benchmark.byTask.feeds.demohtml;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Locale;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser.Parser;
import org.apache.lucene.util.LuceneTestCase;
public class TestHtmlParser extends LuceneTestCase {
public void testUnicode() throws Exception {
String text = "<html><body>汉语</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("汉语", parser);
Parser parser = new Parser(new StringReader(text));
assertEquals("汉语", parser.body);
}
public void testEntities() throws Exception {
String text = "<html><body>&#x6C49;&#x8BED;&yen;</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("汉语¥", parser);
Parser parser = new Parser(new StringReader(text));
assertEquals("汉语¥", parser.body);
}
public void testComments() throws Exception {
String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("foo", parser);
Parser parser = new Parser(new StringReader(text));
assertEquals("foo", parser.body);
}
public void testScript() throws Exception {
String text = "<html><body><script type=\"text/javascript\">" +
"document.write(\"test\")</script>foo</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("foo", parser);
Parser parser = new Parser(new StringReader(text));
assertEquals("foo", parser.body);
}
public void testStyle() throws Exception {
String text = "<html><head><style type=\"text/css\">" +
"body{background-color:blue;}</style>" +
"</head><body>foo</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("foo", parser);
Parser parser = new Parser(new StringReader(text));
assertEquals("foo", parser.body);
}
public void testDoctype() throws Exception {
@ -64,8 +64,8 @@ public class TestHtmlParser extends LuceneTestCase {
"\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
"\"http://www.w3.org/TR/html4/loose.dtd\">" +
"<html><body>foo</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("foo", parser);
Parser parser = new Parser(new StringReader(text));
assertEquals("foo", parser.body);
}
public void testMeta() throws Exception {
@ -75,58 +75,68 @@ public class TestHtmlParser extends LuceneTestCase {
"<meta name=\"keywords\" content=\"this is a test\" />" +
"<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
"</head><body>foobar</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
Properties tags = parser.getMetaTags();
Parser parser = new Parser(new StringReader(text));
Properties tags = parser.metaTags;
assertEquals(4, tags.size());
assertEquals("1", tags.get("a"));
assertEquals("2", tags.get("b"));
assertEquals("this is a test", tags.get("keywords"));
assertEquals("text/html;charset=utf-8", tags.get("content-type"));
assertEquals("text/html;charset=UTF-8", tags.get("content-type"));
}
public void testTitle() throws Exception {
String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertEquals("foo", parser.getTitle());
}
public void testSummary() throws Exception {
String text = "<html><head><TITLE>foo</TITLE><head><body>" +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"Summarize me. Summarize me. Summarize me. Summarize me. " +
"</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertEquals(200, parser.getSummary().length());
}
// LUCENE-590
public void testSummaryTitle() throws Exception {
String text = "<html><head><title>Summary</title></head><body>Summary of the document</body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertEquals("Summary of the document", parser.getSummary());
Parser parser = new Parser(new StringReader(text));
assertEquals("foo", parser.title);
}
// LUCENE-2246
public void testTurkish() throws Exception {
String text = "<html><body>" +
"<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
"<a title=\"(ııı)\"></body></html>";
HTMLParser parser = new HTMLParser(new StringReader(text));
assertReadsTo("[ş]", parser);
final Locale saved = Locale.getDefault();
try {
Locale.setDefault(new Locale("tr", "TR"));
String text = "<html><HEAD><TITLE>ııı</TITLE></head><body>" +
"<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
"<a title=\"(ııı)\"></body></html>";
Parser parser = new Parser(new StringReader(text));
assertEquals("ııı", parser.title);
assertEquals("[ş]", parser.body);
} finally {
Locale.setDefault(saved);
}
}
private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
Reader reader = parser.getReader();
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1) {
builder.append((char)ch);
}
assertEquals(expected, builder.toString());
public void testSampleTRECDoc() throws Exception {
String text = "<html>\r\n" +
"\r\n" +
"<head>\r\n" +
"<title>\r\n" +
"TEST-000 title\r\n" +
"</title>\r\n" +
"</head>\r\n" +
"\r\n" +
"<body>\r\n" +
"TEST-000 text\r\n" +
"\r\n" +
"</body>\r\n" +
"\r\n";
Parser parser = new Parser(new StringReader(text));
assertEquals("TEST-000 title", parser.title);
assertEquals("TEST-000 text", parser.body.trim());
}
public void testNoHTML() throws Exception {
String text = "hallo";
Parser parser = new Parser(new StringReader(text));
assertEquals("", parser.title);
assertEquals("hallo", parser.body);
}
public void testInvalid() throws Exception {
String text = "<title>foo</title>bar";
Parser parser = new Parser(new StringReader(text));
assertEquals("foo", parser.title);
assertEquals("bar", parser.body);
}
}
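The Turkish-locale test above guards against the classic dotless-i pitfall: under tr_TR the default-locale String.toLowerCase() maps 'I' to 'ı', so tag, attribute, and meta names have to be lowercased with Locale.ROOT. A quick demonstration of the JDK behavior involved, independent of the parser:

// Why testTurkish matters: default-locale lowercasing is locale-sensitive,
// Locale.ROOT lowercasing is not.
import java.util.Locale;

public class TurkishLowercase {
  public static void main(String[] args) {
    Locale saved = Locale.getDefault();
    try {
      Locale.setDefault(new Locale("tr", "TR"));
      System.out.println("TITLE".toLowerCase());             // "tıtle" (dotless i)
      System.out.println("TITLE".toLowerCase(Locale.ROOT));  // "title"
    } finally {
      Locale.setDefault(saved);
    }
  }
}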

View File

@ -166,6 +166,7 @@ public class TrecContentSourceTest extends LuceneTestCase {
"<title>\r\n" +
"TEST-001 title\r\n" +
"</title>\r\n" +
"<meta name=\"date\" content=\"Tue&#44; 09 Dec 2003 22&#58;39&#58;08 GMT\">" +
"</head>\r\n" +
"\r\n" +
"<body>\r\n" +
@ -183,7 +184,7 @@ public class TrecContentSourceTest extends LuceneTestCase {
dd = source.getNextDocData(dd);
assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
.parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
.parseDate("Tue, 09 Dec 2003 22:39:08 GMT"));
assertNoMoreDataException(source);
}
@ -331,6 +332,7 @@ public class TrecContentSourceTest extends LuceneTestCase {
dd = source.getNextDocData(dd);
assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
source.close();
// Don't test that NoMoreDataException is thrown, since the forever flag is
// turned on.