mirror of https://github.com/apache/lucene.git
LUCENE-971: extract wikipedia documents as a doc maker directly from XML file without using intermediate one-file-per-document
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564151 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2d954694dc
commit
d1422ebd6b
|
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
8/9/07
|
||||
LUCENE-971: Change enwiki tasks to a doc maker (extending
|
||||
LineDocMaker) that directly processes the Wikipedia XML and produces
|
||||
documents. Intermediate files (one per document) are no longer
|
||||
created.
|
||||
|
||||
8/1/07
|
||||
LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
|
||||
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
|
||||
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
|
||||
<available file="${working.dir}/enwiki" property="enwiki.extracted"/>
|
||||
<available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
|
||||
|
||||
</target>
|
||||
|
||||
|
@ -31,7 +31,6 @@
|
|||
<mkdir dir="temp"/>
|
||||
<antcall target="get-enwiki"/>
|
||||
<antcall target="expand-enwiki"/>
|
||||
<antcall target="extract-enwiki"/>
|
||||
</target>
|
||||
|
||||
<target name="get-enwiki" unless="enwiki.exists">
|
||||
|
@ -43,14 +42,6 @@
|
|||
<bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
|
||||
</target>
|
||||
|
||||
<target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
|
||||
<mkdir dir="${working.dir}/enwiki"/>
|
||||
<java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
|
||||
<classpath refid="run.classpath"/>
|
||||
<arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<target name="get-news-20" unless="20news-18828.exists">
|
||||
<get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
|
||||
dest="temp/news20.tar.gz"/>
|
||||
|
@ -164,7 +155,7 @@
|
|||
<enable/>
|
||||
</assertions>
|
||||
<classpath refid="run.classpath"/>
|
||||
<arg line="conf/wikipedia.alg"/>
|
||||
<arg line="conf/extractWikipedia.alg"/>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
#/**
|
||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# * contributor license agreements. See the NOTICE file distributed with
|
||||
# * this work for additional information regarding copyright ownership.
|
||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# * (the "License"); you may not use this file except in compliance with
|
||||
# * the License. You may obtain a copy of the License at
|
||||
# *
|
||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||
# *
|
||||
# * Unless required by applicable law or agreed to in writing, software
|
||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# * See the License for the specific language governing permissions and
|
||||
# * limitations under the License.
|
||||
# */
|
||||
# -------------------------------------------------------------------------------------
|
||||
|
||||
#
|
||||
# This alg will process the Wikipedia documents feed to produce a
|
||||
# single file that contains all documents, one per line.
|
||||
#
|
||||
# To use this, first cd to contrib/benchmark and then run:
|
||||
#
|
||||
# ant run-task -Dtask.alg=conf/extractWikipedia.alg
|
||||
#
|
||||
# Then, to index the documents in the line file, see
|
||||
# indexLineFile.alg.
|
||||
#
|
||||
|
||||
# Where to get documents from:
|
||||
doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
|
||||
docs.file=temp/enwiki-20070527-pages-articles.xml
|
||||
|
||||
# Where to write the line file output:
|
||||
line.file.out=work/enwiki.txt
|
||||
|
||||
# Stop after processing the document feed once:
|
||||
doc.maker.forever=false
|
||||
|
||||
# -------------------------------------------------------------------------------------
|
||||
|
||||
# Process all documents, appending each one to the line file:
|
||||
{WriteLineDoc() > : *
|
|
@ -0,0 +1,213 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.xml.sax.XMLReader;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
import org.xml.sax.helpers.XMLReaderFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
/**
|
||||
* A LineDocMaker which reads the uncompressed english wikipedia dump.
|
||||
*/
|
||||
public class EnwikiDocMaker extends LineDocMaker {
|
||||
|
||||
static final int TITLE = 0;
|
||||
static final int DATE = TITLE+1;
|
||||
static final int BODY = DATE+1;
|
||||
static final int LENGTH = BODY+1;
|
||||
|
||||
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||
"MAY", "JUN", "JUL", "AUG",
|
||||
"SEP", "OCT", "NOV", "DEC"};
|
||||
|
||||
class Parser extends DefaultHandler implements Runnable {
|
||||
|
||||
Thread t;
|
||||
|
||||
public void run() {
|
||||
|
||||
try {
|
||||
XMLReader reader =
|
||||
XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
|
||||
reader.setContentHandler(this);
|
||||
reader.setErrorHandler(this);
|
||||
while(true){
|
||||
InputSource is = new InputSource(fileIS);
|
||||
reader.parse(is);
|
||||
if (!forever) {
|
||||
synchronized(this) {
|
||||
nmde = new NoMoreDataException();
|
||||
notify();
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
synchronized(this){
|
||||
openFile();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (SAXException sae) {
|
||||
throw new RuntimeException(sae);
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Parser() {
|
||||
t = new Thread(this);
|
||||
t.setDaemon(true);
|
||||
t.start();
|
||||
}
|
||||
|
||||
String[] tuple;
|
||||
NoMoreDataException nmde;
|
||||
|
||||
String[] next() throws NoMoreDataException {
|
||||
String[] result;
|
||||
synchronized(this){
|
||||
while(tuple == null && nmde == null){
|
||||
try {
|
||||
wait();
|
||||
} catch (InterruptedException ie) {
|
||||
}
|
||||
}
|
||||
if (nmde != null) {
|
||||
throw nmde;
|
||||
}
|
||||
result = tuple;
|
||||
tuple = null;
|
||||
notify();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
StringBuffer contents = new StringBuffer();
|
||||
|
||||
public void characters(char[] ch, int start, int length) {
|
||||
contents.append(ch, start, length);
|
||||
}
|
||||
|
||||
String title;
|
||||
String body;
|
||||
String time;
|
||||
|
||||
static final int BASE = 10;
|
||||
|
||||
public void startElement(String namespace,
|
||||
String simple,
|
||||
String qualified,
|
||||
Attributes attributes) {
|
||||
if (qualified.equals("page")) {
|
||||
title = null;
|
||||
body = null;
|
||||
time = null;
|
||||
} else if (qualified.equals("text")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("title")) {
|
||||
contents.setLength(0);
|
||||
}
|
||||
}
|
||||
|
||||
String time(String original) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
|
||||
buffer.append(original.substring(8, 10));
|
||||
buffer.append('-');
|
||||
buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
|
||||
buffer.append('-');
|
||||
buffer.append(original.substring(0, 4));
|
||||
buffer.append(' ');
|
||||
buffer.append(original.substring(11, 19));
|
||||
buffer.append(".000");
|
||||
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
public void create(String title, String time, String body) {
|
||||
String[] t = new String[LENGTH];
|
||||
t[TITLE] = title.replace('\t', ' ');
|
||||
t[DATE] = time.replace('\t', ' ');
|
||||
t[BODY] = body.replaceAll("[\t\n]", " ");
|
||||
synchronized(this) {
|
||||
while(tuple!=null) {
|
||||
try {
|
||||
wait();
|
||||
} catch (InterruptedException ie) {
|
||||
}
|
||||
}
|
||||
tuple = t;
|
||||
notify();
|
||||
}
|
||||
}
|
||||
|
||||
public void endElement(String namespace, String simple, String qualified)
|
||||
throws SAXException {
|
||||
if (qualified.equals("title")) {
|
||||
title = contents.toString();
|
||||
} else if (qualified.equals("text")) {
|
||||
body = contents.toString();
|
||||
if (body.startsWith("#REDIRECT") ||
|
||||
body.startsWith("#redirect")) {
|
||||
body = null;
|
||||
}
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
time = time(contents.toString());
|
||||
} else if (qualified.equals("page")) {
|
||||
if (body != null) {
|
||||
create(title, time, body);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Parser parser = new Parser();
|
||||
|
||||
class DocState extends LineDocMaker.DocState {
|
||||
public Document setFields(String[] tuple) {
|
||||
titleField.setValue(tuple[TITLE]);
|
||||
dateField.setValue(tuple[DATE]);
|
||||
bodyField.setValue(tuple[BODY]);
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
||||
private DocState getDocState() {
|
||||
DocState ds = (DocState) docState.get();
|
||||
if (ds == null) {
|
||||
ds = new DocState();
|
||||
docState.set(ds);
|
||||
}
|
||||
return ds;
|
||||
}
|
||||
|
||||
public Document makeDocument() throws Exception {
|
||||
String[] tuple = parser.next();
|
||||
return getDocState().setFields(tuple);
|
||||
}
|
||||
|
||||
}
|
|
@ -25,7 +25,8 @@ import org.apache.lucene.document.Field;
|
|||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.FileReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
/**
|
||||
* A DocMaker reading one line at a time as a Document from
|
||||
|
@ -39,13 +40,14 @@ import java.io.FileReader;
|
|||
*/
|
||||
public class LineDocMaker extends BasicDocMaker {
|
||||
|
||||
private BufferedReader fileIn;
|
||||
private ThreadLocal docState = new ThreadLocal();
|
||||
FileInputStream fileIS;
|
||||
BufferedReader fileIn;
|
||||
ThreadLocal docState = new ThreadLocal();
|
||||
private String fileName;
|
||||
|
||||
private static int READER_BUFFER_BYTES = 64*1024;
|
||||
|
||||
private class DocState {
|
||||
|
||||
class DocState {
|
||||
Document doc;
|
||||
Field bodyField;
|
||||
Field titleField;
|
||||
|
@ -63,7 +65,7 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
storeVal,
|
||||
Field.Index.TOKENIZED,
|
||||
termVecVal);
|
||||
dateField = new Field(BasicDocMaker.TITLE_FIELD,
|
||||
dateField = new Field(BasicDocMaker.DATE_FIELD,
|
||||
"",
|
||||
storeVal,
|
||||
Field.Index.TOKENIZED,
|
||||
|
@ -143,11 +145,12 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
openFile();
|
||||
}
|
||||
|
||||
private void openFile() {
|
||||
void openFile() {
|
||||
try {
|
||||
if (fileIn != null)
|
||||
fileIn.close();
|
||||
fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
|
||||
fileIS = new FileInputStream(fileName);
|
||||
fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
|
|
@ -18,7 +18,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
|||
*/
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileWriter;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
|
@ -59,7 +60,7 @@ public class WriteLineDocTask extends PerfTask {
|
|||
String fileName = config.get("line.file.out", null);
|
||||
if (fileName == null)
|
||||
throw new Exception("line.file.out must be set");
|
||||
lineFileOut = new BufferedWriter(new FileWriter(fileName));
|
||||
lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
|
||||
}
|
||||
docMaker = getRunData().getDocMaker();
|
||||
}
|
||||
|
|
|
@ -1,211 +0,0 @@
|
|||
package org.apache.lucene.benchmark.utils;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.XMLReader;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
import org.xml.sax.helpers.XMLReaderFactory;
|
||||
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Extract the downloaded Wikipedia dump into separate files for indexing.
|
||||
*/
|
||||
public class ExtractWikipedia {
|
||||
|
||||
private File wikipedia;
|
||||
private File outputDir;
|
||||
|
||||
public ExtractWikipedia(File wikipedia, File outputDir) {
|
||||
this.wikipedia = wikipedia;
|
||||
this.outputDir = outputDir;
|
||||
System.out.println("Deleting all files in " + outputDir);
|
||||
File [] files = outputDir.listFiles();
|
||||
for (int i = 0; i < files.length; i++) {
|
||||
files[i].delete();
|
||||
}
|
||||
}
|
||||
|
||||
static public int count = 0;
|
||||
static String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||
"MAY", "JUN", "JUL", "AUG",
|
||||
"SEP", "OCT", "NOV", "DEC"};
|
||||
|
||||
public class Parser extends DefaultHandler {
|
||||
|
||||
public Parser() {
|
||||
}
|
||||
|
||||
StringBuffer contents = new StringBuffer();
|
||||
|
||||
public void characters(char[] ch, int start, int length) {
|
||||
contents.append(ch, start, length);
|
||||
}
|
||||
|
||||
String title;
|
||||
String id;
|
||||
String body;
|
||||
String time;
|
||||
|
||||
static final int BASE = 10;
|
||||
|
||||
public void startElement(String namespace,
|
||||
String simple,
|
||||
String qualified,
|
||||
Attributes attributes) {
|
||||
if (qualified.equals("page")) {
|
||||
title = null;
|
||||
id = null;
|
||||
body = null;
|
||||
time = null;
|
||||
} else if (qualified.equals("text")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("title")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("id")) {
|
||||
contents.setLength(0);
|
||||
}
|
||||
}
|
||||
|
||||
public File directory (int count, File directory) {
|
||||
if (directory == null) {
|
||||
directory = outputDir;
|
||||
}
|
||||
int base = BASE;
|
||||
while (base <= count) {
|
||||
base *= BASE;
|
||||
}
|
||||
if (count < BASE) {
|
||||
return directory;
|
||||
}
|
||||
directory = new File (directory, (Integer.toString(base / BASE)));
|
||||
directory = new File (directory, (Integer.toString(count / (base / BASE))));
|
||||
return directory(count % (base / BASE), directory);
|
||||
}
|
||||
|
||||
public void create(String id, String title, String time, String body) {
|
||||
|
||||
File d = directory(count++, null);
|
||||
d.mkdirs();
|
||||
File f = new File(d, id + ".txt");
|
||||
|
||||
StringBuffer contents = new StringBuffer();
|
||||
|
||||
contents.append(time);
|
||||
contents.append("\n\n");
|
||||
contents.append(title);
|
||||
contents.append("\n\n");
|
||||
contents.append(body);
|
||||
contents.append("\n");
|
||||
|
||||
try {
|
||||
FileWriter writer = new FileWriter(f);
|
||||
writer.write(contents.toString());
|
||||
writer.close();
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
String time(String original) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
|
||||
buffer.append(original.substring(8, 10));
|
||||
buffer.append('-');
|
||||
buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
|
||||
buffer.append('-');
|
||||
buffer.append(original.substring(0, 4));
|
||||
buffer.append(' ');
|
||||
buffer.append(original.substring(11, 19));
|
||||
buffer.append(".000");
|
||||
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
public void endElement(String namespace, String simple, String qualified) {
|
||||
if (qualified.equals("title")) {
|
||||
title = contents.toString();
|
||||
} else if (qualified.equals("text")) {
|
||||
body = contents.toString();
|
||||
if (body.startsWith("#REDIRECT") ||
|
||||
body.startsWith("#redirect")) {
|
||||
body = null;
|
||||
}
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
time = time(contents.toString());
|
||||
} else if (qualified.equals("id") && id == null) {
|
||||
id = contents.toString();
|
||||
} else if (qualified.equals("page")) {
|
||||
if (body != null) {
|
||||
create(id, title, time, body);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void extract() {
|
||||
|
||||
try {
|
||||
Parser parser = new Parser();
|
||||
if (false) {
|
||||
SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
|
||||
sp.parse(new FileInputStream(wikipedia), parser);
|
||||
} else {
|
||||
XMLReader reader =
|
||||
XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
|
||||
reader.setContentHandler(parser);
|
||||
reader.setErrorHandler(parser);
|
||||
reader.parse(new InputSource(new FileInputStream(wikipedia)));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
if (args.length != 2) {
|
||||
printUsage();
|
||||
}
|
||||
|
||||
File wikipedia = new File(args[0]);
|
||||
|
||||
if (wikipedia.exists()) {
|
||||
File outputDir = new File(args[1]);
|
||||
outputDir.mkdirs();
|
||||
ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
|
||||
extractor.extract();
|
||||
} else {
|
||||
printUsage();
|
||||
}
|
||||
}
|
||||
|
||||
private static void printUsage() {
|
||||
System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue