LUCENE-971: extract Wikipedia documents via a doc maker directly from the XML file, without creating intermediate one-file-per-document output

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564151 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2007-08-09 08:57:26 +00:00
parent 2d954694dc
commit d1422ebd6b
7 changed files with 279 additions and 232 deletions

File: contrib/benchmark/CHANGES.txt

@@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
 $Id:$

+8/9/07
+
+LUCENE-971: Change enwiki tasks to a doc maker (extending
+  LineDocMaker) that directly processes the Wikipedia XML and produces
+  documents.  Intermediate files (one per document) are no longer
+  created.

 8/1/07

 LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.
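
The new flow is driven entirely by the benchmark task framework. A hypothetical driver loop (illustrative only, not code from this commit; it assumes the benchmark classes DocMaker and NoMoreDataException on the classpath) shows the contract the pieces below rely on:

  // Each call to makeDocument() yields one Wikipedia page;
  // NoMoreDataException ends the pass when doc.maker.forever=false.
  static void consumeAll(DocMaker docMaker) throws Exception {
    while (true) {
      Document doc;
      try {
        doc = docMaker.makeDocument();  // blocks until the parser thread hands over a page
      } catch (NoMoreDataException e) {
        break;                          // feed exhausted after a single pass
      }
      // ... WriteLineDocTask appends one line per document here ...
    }
  }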

File: contrib/benchmark/build.xml

@@ -23,7 +23,7 @@
     <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
     <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
-    <available file="${working.dir}/enwiki" property="enwiki.extracted"/>
+    <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
   </target>
@@ -31,7 +31,6 @@
     <mkdir dir="temp"/>
     <antcall target="get-enwiki"/>
     <antcall target="expand-enwiki"/>
-    <antcall target="extract-enwiki"/>
   </target>

   <target name="get-enwiki" unless="enwiki.exists">
@@ -43,14 +42,6 @@
     <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
   </target>

-  <target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
-    <mkdir dir="${working.dir}/enwiki"/>
-    <java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
-      <classpath refid="run.classpath"/>
-      <arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
-    </java>
-  </target>

   <target name="get-news-20" unless="20news-18828.exists">
     <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
          dest="temp/news20.tar.gz"/>
@@ -164,7 +155,7 @@
         <enable/>
       </assertions>
       <classpath refid="run.classpath"/>
-      <arg line="conf/wikipedia.alg"/>
+      <arg line="conf/extractWikipedia.alg"/>
     </java>
   </target>

File: contrib/benchmark/conf/extractWikipedia.alg (new)

@@ -0,0 +1,44 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
#
# This alg will process the Wikipedia documents feed to produce a
# single file that contains all documents, one per line.
#
# To use this, first cd to contrib/benchmark and then run:
#
# ant run-task -Dtask.alg=conf/extractWikipedia.alg
#
# Then, to index the documents in the line file, see
# indexLineFile.alg.
#
# Where to get documents from:
doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
docs.file=temp/enwiki-20070527-pages-articles.xml
# Where to write the line file output:
line.file.out=work/enwiki.txt
# Stop after processing the document feed once:
doc.maker.forever=false
# -------------------------------------------------------------------------------------
# Process all documents, appending each one to the line file:
{WriteLineDoc() > : *
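
The run writes work/enwiki.txt with one document per line. Judging from EnwikiDocMaker below, which strips tabs and newlines out of every field, the fields are tab-separated in TITLE, DATE, BODY order; a small illustrative reader under that assumption (this class is not part of the commit):

  import java.io.BufferedReader;
  import java.io.FileInputStream;
  import java.io.InputStreamReader;

  public class ReadLineFile {
    public static void main(String[] args) throws Exception {
      BufferedReader in = new BufferedReader(
          new InputStreamReader(new FileInputStream("work/enwiki.txt"), "UTF-8"));
      String line;
      while ((line = in.readLine()) != null) {
        String[] fields = line.split("\t", 3);  // title, date, body
        if (fields.length == 3) {
          System.out.println("title=" + fields[0] + " date=" + fields[1]);
        }
      }
      in.close();
    }
  }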

File: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (new)

@@ -0,0 +1,213 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.xml.sax.XMLReader;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.IOException;

import org.apache.lucene.document.Document;

/**
 * A LineDocMaker which reads the uncompressed English Wikipedia dump.
 */
public class EnwikiDocMaker extends LineDocMaker {

  static final int TITLE = 0;
  static final int DATE = TITLE + 1;
  static final int BODY = DATE + 1;
  static final int LENGTH = BODY + 1;

  static final String[] months = {"JAN", "FEB", "MAR", "APR",
                                  "MAY", "JUN", "JUL", "AUG",
                                  "SEP", "OCT", "NOV", "DEC"};

  // SAX handler that runs on its own daemon thread and hands one
  // (title, date, body) tuple at a time to the consuming thread.
  class Parser extends DefaultHandler implements Runnable {

    Thread t;

    public void run() {
      try {
        XMLReader reader =
          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        while (true) {
          InputSource is = new InputSource(fileIS);
          reader.parse(is);
          if (!forever) {
            // Not cycling: record the end condition and wake any thread
            // blocked in next().
            synchronized (this) {
              nmde = new NoMoreDataException();
              notify();
            }
            return;
          } else {
            // Cycling: reopen the dump and parse it again.
            synchronized (this) {
              openFile();
            }
          }
        }
      } catch (SAXException sae) {
        throw new RuntimeException(sae);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }

    Parser() {
      t = new Thread(this);
      t.setDaemon(true);
      t.start();
    }

    String[] tuple;
    NoMoreDataException nmde;

    // Called by the consuming thread; blocks until the parser thread has
    // produced the next tuple, or throws once the feed is exhausted.
    String[] next() throws NoMoreDataException {
      String[] result;
      synchronized (this) {
        while (tuple == null && nmde == null) {
          try {
            wait();
          } catch (InterruptedException ie) {
          }
        }
        if (nmde != null) {
          throw nmde;
        }
        result = tuple;
        tuple = null;
        notify();
      }
      return result;
    }

    StringBuffer contents = new StringBuffer();

    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    String title;
    String body;
    String time;

    static final int BASE = 10;   // unused here; carried over from ExtractWikipedia

    public void startElement(String namespace,
                             String simple,
                             String qualified,
                             Attributes attributes) {
      if (qualified.equals("page")) {
        title = null;
        body = null;
        time = null;
      } else if (qualified.equals("text")) {
        contents.setLength(0);
      } else if (qualified.equals("timestamp")) {
        contents.setLength(0);
      } else if (qualified.equals("title")) {
        contents.setLength(0);
      }
    }

    // Reformats the dump's ISO-8601 timestamp; e.g. "2007-05-27T12:34:56Z"
    // becomes "27-MAY-2007 12:34:56.000".
    String time(String original) {
      StringBuffer buffer = new StringBuffer();
      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");
      return buffer.toString();
    }

    public void create(String title, String time, String body) {
      String[] t = new String[LENGTH];
      // Strip tabs and newlines so the document can be written out later
      // as a single line of tab-separated fields.
      t[TITLE] = title.replace('\t', ' ');
      t[DATE] = time.replace('\t', ' ');
      t[BODY] = body.replaceAll("[\t\n]", " ");
      // Single-slot handoff: wait until the consumer has taken the
      // previous tuple before publishing this one.
      synchronized (this) {
        while (tuple != null) {
          try {
            wait();
          } catch (InterruptedException ie) {
          }
        }
        tuple = t;
        notify();
      }
    }

    public void endElement(String namespace, String simple, String qualified)
      throws SAXException {
      if (qualified.equals("title")) {
        title = contents.toString();
      } else if (qualified.equals("text")) {
        body = contents.toString();
        if (body.startsWith("#REDIRECT") ||
            body.startsWith("#redirect")) {
          // Skip redirect pages: they have no article body.
          body = null;
        }
      } else if (qualified.equals("timestamp")) {
        time = time(contents.toString());
      } else if (qualified.equals("page")) {
        if (body != null) {
          create(title, time, body);
        }
      }
    }
  }

  Parser parser = new Parser();

  class DocState extends LineDocMaker.DocState {
    public Document setFields(String[] tuple) {
      titleField.setValue(tuple[TITLE]);
      dateField.setValue(tuple[DATE]);
      bodyField.setValue(tuple[BODY]);
      return doc;
    }
  }

  private DocState getDocState() {
    DocState ds = (DocState) docState.get();
    if (ds == null) {
      ds = new DocState();
      docState.set(ds);
    }
    return ds;
  }

  public Document makeDocument() throws Exception {
    String[] tuple = parser.next();
    return getDocState().setFields(tuple);
  }
}
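
The Parser above is a classic single-slot producer/consumer handoff: the SAX thread blocks in create() until the consumer takes the previous tuple, and makeDocument() blocks in next() until a tuple (or end-of-data) arrives. The same pattern reduced to its essentials, as a self-contained sketch with illustrative names:

  // Minimal single-slot handoff using the same wait/notify scheme.
  public class Handoff {
    private String slot;     // null means "empty"
    private boolean done;

    public synchronized void put(String item) throws InterruptedException {
      while (slot != null) wait();  // producer waits until the slot is free
      slot = item;
      notify();                     // wake the consumer
    }

    public synchronized void finish() {
      done = true;
      notify();                     // wake a consumer waiting for more data
    }

    public synchronized String take() throws InterruptedException {
      while (slot == null && !done) wait();  // consumer waits for an item or EOF
      if (slot == null) return null;         // feed exhausted
      String item = slot;
      slot = null;
      notify();                              // wake the producer
      return item;
    }
  }

With exactly one producer and one consumer, notify() is sufficient; with more threads on either side you would use notifyAll() or a java.util.concurrent.SynchronousQueue instead.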

File: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java

@@ -25,7 +25,8 @@ import org.apache.lucene.document.Field;
 import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;

 /**
  * A DocMaker reading one line at a time as a Document from
@@ -39,13 +40,14 @@ import java.io.FileReader;
  */
 public class LineDocMaker extends BasicDocMaker {

-  private BufferedReader fileIn;
-  private ThreadLocal docState = new ThreadLocal();
+  FileInputStream fileIS;
+  BufferedReader fileIn;
+  ThreadLocal docState = new ThreadLocal();
   private String fileName;

   private static int READER_BUFFER_BYTES = 64*1024;

-  private class DocState {
+  class DocState {
     Document doc;
     Field bodyField;
     Field titleField;
@@ -63,7 +65,7 @@ public class LineDocMaker extends BasicDocMaker {
                             storeVal,
                             Field.Index.TOKENIZED,
                             termVecVal);
-      dateField = new Field(BasicDocMaker.TITLE_FIELD,
+      dateField = new Field(BasicDocMaker.DATE_FIELD,
                             "",
                             storeVal,
                             Field.Index.TOKENIZED,
@@ -143,11 +145,12 @@ public class LineDocMaker extends BasicDocMaker {
     openFile();
   }

-  private void openFile() {
+  void openFile() {
     try {
       if (fileIn != null)
         fileIn.close();
-      fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
+      fileIS = new FileInputStream(fileName);
+      fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
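
The docState field (made package-visible above so EnwikiDocMaker can plug in its own DocState subclass) implements a per-thread reuse idiom: each indexing thread lazily creates one DocState and then mutates its Document and Fields in place via setValue(), with no synchronization or per-document allocation. The idiom in isolation, as an illustrative sketch in the same raw-ThreadLocal style as the code above:

  // Each thread gets its own lazily created, reusable State.
  public class StatePerThread {
    static class State {
      int docsMade;  // stand-in for the reusable Document/Field objects
    }

    private final ThreadLocal state = new ThreadLocal();

    State getState() {
      State s = (State) state.get();
      if (s == null) {       // first call on this thread: create its State
        s = new State();
        state.set(s);
      }
      return s;
    }

    public int nextDoc() {
      return getState().docsMade++;  // thread-confined, so no locking needed
    }
  }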

File: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java

@@ -18,7 +18,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
  */

 import java.io.BufferedWriter;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;

 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@@ -59,7 +60,7 @@ public class WriteLineDocTask extends PerfTask {
       String fileName = config.get("line.file.out", null);
       if (fileName == null)
         throw new Exception("line.file.out must be set");
-      lineFileOut = new BufferedWriter(new FileWriter(fileName));
+      lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"));
     }
     docMaker = getRunData().getDocMaker();
   }
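
Both IO changes here share one rationale: FileWriter and FileReader always use the JVM's platform default charset, so the line file would only round-trip correctly on machines whose default locale charset is UTF-8. A short self-contained sketch of the fix (file name and content illustrative):

  import java.io.BufferedWriter;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.OutputStreamWriter;
  import java.io.Writer;

  public class Utf8WriteDemo {
    public static void main(String[] args) throws IOException {
      // new FileWriter("demo.txt") would encode with the platform default
      // charset (e.g. windows-1252), corrupting non-ASCII titles.
      Writer out = new BufferedWriter(new OutputStreamWriter(
          new FileOutputStream("demo.txt"), "UTF-8"));  // encoding pinned to UTF-8
      out.write("Z\u00fcrich\t27-MAY-2007 12:34:56.000\tarticle body...\n");
      out.close();
    }
  }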

File: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (deleted)

@@ -1,211 +0,0 @@
package org.apache.lucene.benchmark.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Extract the downloaded Wikipedia dump into separate files for indexing.
 */
public class ExtractWikipedia {

  private File wikipedia;
  private File outputDir;

  public ExtractWikipedia(File wikipedia, File outputDir) {
    this.wikipedia = wikipedia;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    File[] files = outputDir.listFiles();
    for (int i = 0; i < files.length; i++) {
      files[i].delete();
    }
  }

  static public int count = 0;

  static String[] months = {"JAN", "FEB", "MAR", "APR",
                            "MAY", "JUN", "JUL", "AUG",
                            "SEP", "OCT", "NOV", "DEC"};

  public class Parser extends DefaultHandler {

    public Parser() {
    }

    StringBuffer contents = new StringBuffer();

    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    String title;
    String id;
    String body;
    String time;

    static final int BASE = 10;

    public void startElement(String namespace,
                             String simple,
                             String qualified,
                             Attributes attributes) {
      if (qualified.equals("page")) {
        title = null;
        id = null;
        body = null;
        time = null;
      } else if (qualified.equals("text")) {
        contents.setLength(0);
      } else if (qualified.equals("timestamp")) {
        contents.setLength(0);
      } else if (qualified.equals("title")) {
        contents.setLength(0);
      } else if (qualified.equals("id")) {
        contents.setLength(0);
      }
    }

    public File directory(int count, File directory) {
      if (directory == null) {
        directory = outputDir;
      }
      int base = BASE;
      while (base <= count) {
        base *= BASE;
      }
      if (count < BASE) {
        return directory;
      }
      directory = new File(directory, (Integer.toString(base / BASE)));
      directory = new File(directory, (Integer.toString(count / (base / BASE))));
      return directory(count % (base / BASE), directory);
    }

    public void create(String id, String title, String time, String body) {
      File d = directory(count++, null);
      d.mkdirs();
      File f = new File(d, id + ".txt");
      StringBuffer contents = new StringBuffer();
      contents.append(time);
      contents.append("\n\n");
      contents.append(title);
      contents.append("\n\n");
      contents.append(body);
      contents.append("\n");
      try {
        FileWriter writer = new FileWriter(f);
        writer.write(contents.toString());
        writer.close();
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }

    String time(String original) {
      StringBuffer buffer = new StringBuffer();
      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");
      return buffer.toString();
    }

    public void endElement(String namespace, String simple, String qualified) {
      if (qualified.equals("title")) {
        title = contents.toString();
      } else if (qualified.equals("text")) {
        body = contents.toString();
        if (body.startsWith("#REDIRECT") ||
            body.startsWith("#redirect")) {
          body = null;
        }
      } else if (qualified.equals("timestamp")) {
        time = time(contents.toString());
      } else if (qualified.equals("id") && id == null) {
        id = contents.toString();
      } else if (qualified.equals("page")) {
        if (body != null) {
          create(id, title, time, body);
        }
      }
    }
  }

  public void extract() {
    try {
      Parser parser = new Parser();
      if (false) {
        SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
        sp.parse(new FileInputStream(wikipedia), parser);
      } else {
        XMLReader reader =
          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        reader.setContentHandler(parser);
        reader.setErrorHandler(parser);
        reader.parse(new InputSource(new FileInputStream(wikipedia)));
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static void main(String[] args) {
    if (args.length != 2) {
      printUsage();
    }
    File wikipedia = new File(args[0]);
    if (wikipedia.exists()) {
      File outputDir = new File(args[1]);
      outputDir.mkdirs();
      ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
      extractor.extract();
    } else {
      printUsage();
    }
  }

  private static void printUsage() {
    System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
  }
}