LUCENE-848. Add Wikipedia benchmarking support

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@552229 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2007-07-01 02:19:10 +00:00
parent 9ff9bf8142
commit bc7c586468
11 changed files with 579 additions and 29 deletions

View File

@ -0,0 +1,2 @@
- /work
- /temp

View File

@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
6/30/07
LUCENE-848: Added support for Wikipedia benchmarking.
6/25/07
- LUCENE-940: Multi-threaded issues fixed: SimpleDateFormat; logging for addDoc/deleteDoc tasks.
- LUCENE-945: tests fail to find data dirs. Added sys-prop benchmark.work.dir and cfg-prop work.dir.

View File

@ -0,0 +1,22 @@
Support exists for downloading, parsing, and loading the English
version of Wikipedia (enwiki).

The build file can automatically try to download the most current
enwiki dataset (pages-articles.xml.bz2) from the "latest" directory,
http://download.wikimedia.org/enwiki/latest/. However, this file
doesn't always exist, depending on where Wikipedia is in the dump
process and whether prior dumps have succeeded. If this file doesn't
exist, you can sometimes find an older or in-progress version by
looking in the dated directories under
http://download.wikimedia.org/enwiki/. For example, as of this
writing, there is a page file in
http://download.wikimedia.org/enwiki/20070402/. You can download this
file manually and put it in the temp directory. Note that the file
you download will probably have the date in the name, e.g.,
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2.
When you put it in temp, rename it to
enwiki-latest-pages-articles.xml.bz2. After that, "ant enwiki" should
process the data set and run a load test. The Ant targets get-enwiki,
expand-enwiki, and extract-enwiki can also be used to download,
decompress, and extract (to individual files in work/enwiki) the
dataset, respectively.
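
For example, to run each step by hand and then start the load test:

  ant get-enwiki
  ant expand-enwiki
  ant extract-enwiki
  ant enwiki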

View File

@ -1,22 +1,4 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="benchmark" default="default">
<description>
@ -39,6 +21,34 @@
<available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
<available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
<available file="${working.dir}/enwiki" property="enwiki.extracted"/>
</target>
<target name="enwiki-files" depends="check-files">
<mkdir dir="temp"/>
<antcall target="get-enwiki"/>
<antcall target="expand-enwiki"/>
<antcall target="extract-enwiki"/>
</target>
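<!-- Each of the three steps below is skipped when check-files has already
set its property (enwiki.exists, enwiki.expanded, enwiki.extracted),
so completed steps are not repeated on re-runs. -->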
<target name="get-enwiki" unless="enwiki.exists">
<get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
</target>
<target name="expand-enwiki" unless="enwiki.expanded">
<bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
</target>
<target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
<mkdir dir="${working.dir}/enwiki"/>
<java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
<classpath refid="run.classpath"/>
<arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
</java>
</target>
<target name="get-news-20" unless="20news-18828.exists">
@ -102,6 +112,8 @@
<property name="collections.jar" value="commons-collections-3.1.jar"/>
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
<property name="xercesImpl.jar" value="xerces-2.9.0.jar"/>
<property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
<path id="classpath">
<pathelement path="${common.dir}/build/classes/java"/>
@ -110,6 +122,8 @@
<pathelement path="${basedir}/lib/${collections.jar}"/>
<pathelement path="${basedir}/lib/${logging.jar}"/>
<pathelement path="${basedir}/lib/${bean-utils.jar}"/>
<pathelement path="${basedir}/lib/${xercesImpl.jar}"/>
<pathelement path="${basedir}/lib/${xml-apis.jar}"/>
</path>
<path id="run.classpath">
<path refid="classpath"/>
@ -143,13 +157,24 @@
</java>
</target>
<target name="enwiki" depends="compile,check-files,enwiki-files">
<echo>Working Directory: ${working.dir}</echo>
<java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="1024M" fork="true">
<assertions>
<enable/>
</assertions>
<classpath refid="run.classpath"/>
<arg line="conf/wikipedia.alg"/>
</java>
</target>
<target name="compile-demo">
<subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/>
</subant>
</target>
<target name="init" depends="contrib-build.init,compile-demo,check-files"/>
<target name="init" depends="common.init,compile-demo,check-files"/>
<!-- make sure online collections (reuters) are first downloaded -->
<target name="test" depends="init,get-files">

View File

@ -0,0 +1,65 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi-valued params are iterated by NewRound, added to reports, and start with the column name.
#
# based on micro-standard
#
# modified to use wikipedia sources and index entire docs
# currently just used to measure ingest rate
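#
# e.g. merge.factor=mrg:10:100:10:100 below uses 10 in round 1, 100 in
# round 2, and so on at each NewRound, reported under column "mrg".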
merge.factor=mrg:10:100:10:100
max.field.length=2147483647
max.buffered=buf:10:10:100:100
compound=true
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=false
doc.add.log.step=500
docs.dir=enwiki
doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
# tasks at this depth or less print a message when they start
task.max.depth.log=2
log.queries=false
# -------------------------------------------------------------------------------------
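# The algorithm below runs 8 rounds; each round erases the previous
# state, builds a new index by adding 200,000 documents, closes it, and
# NewRound advances the multi-valued properties above.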
{ "Rounds"
ResetSystemErase
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc > : 200000
CloseIndex
}
NewRound
} : 8
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -0,0 +1,2 @@
AnyObjectId[99ee39d5be4f9700474691d8a5ed0a5058e27f7b] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[d42c0ea6cfd17ed6b444b8337febbc0bdb55ed83] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,210 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.utils.Config;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Stack;
/**
* A DocMaker that reads its input from a directory of text files, such
* as those written by ExtractWikipedia. The path is resolved relative
* to the "work" directory.
*
* Config properties:
* docs.dir=&lt;path to the docs dir | Default: dir-out&gt;
*
*/
public class DirDocMaker extends BasicDocMaker {
private DateFormat dateFormat;
private File dataDir = null;
private int iteration=0;
static public class Iterator implements java.util.Iterator {
int count = 0;
public int getCount(){
return count;
}
Stack stack = new Stack();
/* this seems silly ... there must be a better way ...
not that this is good, but can it matter? */
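/* Zero-pads the shorter of two names so numeric file names compare by
value (e.g. "9.txt" sorts before "10.txt"); the result is reversed
because pushing onto the stack reverses the order again. */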
static class Comparator implements java.util.Comparator {
public int compare(Object _a, Object _b) {
String a = _a.toString();
String b = _b.toString();
int diff = a.length() - b.length();
if (diff > 0) {
while (diff-- > 0) {
b = "0" + b;
}
} else if (diff < 0) {
diff = -diff;
while (diff-- > 0) {
a = "0" + a;
}
}
/* note it's reversed because we're going to push,
which reverses again */
return b.compareTo(a);
}
}
Comparator c = new Comparator();
void push(File[] files) {
Arrays.sort(files, c);
for(int i = 0; i < files.length; i++) {
// System.err.println("push " + files[i]);
stack.push(files[i]);
}
}
void push(File f) {
push(f.listFiles(new FileFilter() {
public boolean accept(File f) { return f.isDirectory(); } }));
push(f.listFiles(new FileFilter() {
public boolean accept(File f) { return f.getName().endsWith(".txt"); } }));
find();
}
void find() {
if (stack.empty()) {
return;
}
if (!((File)stack.peek()).isDirectory()) {
return;
}
File f = (File)stack.pop();
push(f);
}
public Iterator(File f) {
push(f);
}
public void remove() {
throw new UnsupportedOperationException();
}
public boolean hasNext() {
return stack.size() > 0;
}
public Object next() {
assert hasNext();
count++;
Object object = stack.pop();
// System.err.println("pop " + object);
find();
return object;
}
}
private Iterator inputFiles = null;
/* (non-Javadoc)
* @see SimpleDocMaker#setConfig(java.util.Properties)
*/
public void setConfig(Config config) {
super.setConfig(config);
String d = config.get("docs.dir", "dir-out");
dataDir = new File(new File("work"), d);
inputFiles = new Iterator(dataDir);
if (!inputFiles.hasNext()) {
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
}
// date format: 30-MAR-1987 14:22:36
dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US);
dateFormat.setLenient(true);
}
protected DocData getNextDocData() throws Exception {
File f = null;
String name = null;
synchronized (this) {
if (!inputFiles.hasNext()) {
// exhausted files, start a new round, unless forever set to false.
if (!forever) {
throw new NoMoreDataException();
}
inputFiles = new Iterator(dataDir);
iteration++;
}
f = (File) inputFiles.next();
// System.err.println(f);
name = f.getCanonicalPath()+"_"+iteration;
}
BufferedReader reader = new BufferedReader(new FileReader(f));
String line = null;
//First line is the date, 3rd is the title, rest is body
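// (the layout written by ExtractWikipedia.create(): date \n\n title \n\n body)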
String dateStr = reader.readLine();
reader.readLine();//skip an empty line
String title = reader.readLine();
reader.readLine();//skip an empty line
StringBuffer bodyBuf = new StringBuffer(1024);
while ((line = reader.readLine()) != null) {
bodyBuf.append(line).append(' ');
}
reader.close();
addBytes(f.length());
Date date = dateFormat.parse(dateStr.trim());
return new DocData(name, bodyBuf.toString(), title, null, date);
}
/*
* (non-Javadoc)
* @see DocMaker#resetInputs()
*/
public synchronized void resetInputs() {
super.resetInputs();
inputFiles = new Iterator(dataDir);
iteration = 0;
}
/*
* (non-Javadoc)
* @see DocMaker#numUniqueTexts()
*/
public int numUniqueTexts() {
return inputFiles.getCount();
}
}

View File

@ -17,19 +17,20 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import java.io.IOException;
/**
* Create an index.
* <br>Other side effects: index writer object in perfRunData is set.
* <br>Relevant properties: <code>merge.factor , max.buffered</code>.
* <br>Relevant properties: <code>merge.factor, max.buffered,
* max.field.length</code>.
*/
public class CreateIndexTask extends PerfTask {
@ -48,10 +49,12 @@ public class CreateIndexTask extends PerfTask {
boolean cmpnd = config.get("compound",true);
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
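// wikipedia.alg raises this to 2147483647 (Integer.MAX_VALUE) so entire articles are indexed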
iw.setUseCompoundFile(cmpnd);
iw.setMergeFactor(mrgf);
iw.setMaxBufferedDocs(mxbf);
iw.setMaxFieldLength(mxfl);
getRunData().setIndexWriter(iw);
return 1;

View File

@ -17,23 +17,26 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import java.io.IOException;
/**
* Open an index writer.
* <br>Other side effects: index writer object in perfRunData is set.
* <br>Relevant properties: <code>merge.factor , max.buffered</code>.
* <br>Relevant properties: <code>merge.factor, max.buffered,
* max.field.length</code>.
*/
public class OpenIndexTask extends PerfTask {
public static final int DEFAULT_MAX_BUFFERED = 10;
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
public static final int DEFAULT_MERGE_PFACTOR = 10;
public OpenIndexTask(PerfRunData runData) {
@ -50,9 +53,11 @@ public class OpenIndexTask extends PerfTask {
boolean cmpnd = config.get("compound",true);
int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
// must update params for newly opened writer
writer.setMaxBufferedDocs(mxbf);
writer.setMaxFieldLength(mxfl);
writer.setMergeFactor(mrgf);
writer.setUseCompoundFile(cmpnd); // this one redundant?

View File

@ -0,0 +1,211 @@
package org.apache.lucene.benchmark.utils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
/**
* Extract the downloaded Wikipedia dump into separate files for indexing.
*/
public class ExtractWikipedia {
private File wikipedia;
private File outputDir;
public ExtractWikipedia(File wikipedia, File outputDir) {
this.wikipedia = wikipedia;
this.outputDir = outputDir;
System.out.println("Deleting all files in " + outputDir);
File [] files = outputDir.listFiles();
for (int i = 0; i < files.length; i++) {
files[i].delete();
}
}
static public int count = 0;
static String[] months = {"JAN", "FEB", "MAR", "APR",
"MAY", "JUN", "JUL", "AUG",
"SEP", "OCT", "NOV", "DEC"};
public class Parser extends DefaultHandler {
public Parser() {
}
StringBuffer contents = new StringBuffer();
public void characters(char[] ch, int start, int length) {
contents.append(ch, start, length);
}
String title;
String id;
String body;
String time;
static final int BASE = 10;
public void startElement(String namespace,
String simple,
String qualified,
Attributes attributes) {
if (qualified.equals("page")) {
title = null;
id = null;
body = null;
time = null;
} else if (qualified.equals("text")) {
contents.setLength(0);
} else if (qualified.equals("timestamp")) {
contents.setLength(0);
} else if (qualified.equals("title")) {
contents.setLength(0);
} else if (qualified.equals("id")) {
contents.setLength(0);
}
}
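/**
* Maps a running document count to a nested output directory, one
* level per decimal digit so no directory grows too large; e.g.
* count 1234 resolves to outputDir/1000/1/100/2/10/3 (illustrative).
*/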
public File directory (int count, File directory) {
if (directory == null) {
directory = outputDir;
}
int base = BASE;
while (base <= count) {
base *= BASE;
}
if (count < BASE) {
return directory;
}
directory = new File (directory, (Integer.toString(base / BASE)));
directory = new File (directory, (Integer.toString(count / (base / BASE))));
return directory(count % (base / BASE), directory);
}
public void create(String id, String title, String time, String body) {
File d = directory(count++, null);
d.mkdirs();
File f = new File(d, id + ".txt");
StringBuffer contents = new StringBuffer();
contents.append(time);
contents.append("\n\n");
contents.append(title);
contents.append("\n\n");
contents.append(body);
contents.append("\n");
try {
FileWriter writer = new FileWriter(f);
writer.write(contents.toString());
writer.close();
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
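/**
* Converts a Wikipedia dump timestamp, e.g. "2007-05-27T12:34:56Z",
* into "27-MAY-2007 12:34:56.000", which DirDocMaker parses with its
* SimpleDateFormat("dd-MMM-yyyy kk:mm:ss").
*/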
String time(String original) {
StringBuffer buffer = new StringBuffer();
buffer.append(original.substring(8, 10));
buffer.append('-');
buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
buffer.append('-');
buffer.append(original.substring(0, 4));
buffer.append(' ');
buffer.append(original.substring(11, 19));
buffer.append(".000");
return buffer.toString();
}
public void endElement(String namespace, String simple, String qualified) {
if (qualified.equals("title")) {
title = contents.toString();
} else if (qualified.equals("text")) {
body = contents.toString();
if (body.startsWith("#REDIRECT") ||
body.startsWith("#redirect")) {
body = null;
}
} else if (qualified.equals("timestamp")) {
time = time(contents.toString());
} else if (qualified.equals("id") && id == null) {
id = contents.toString();
} else if (qualified.equals("page")) {
if (body != null) {
create(id, title, time, body);
}
}
}
}
public void extract() {
try {
Parser parser = new Parser();
if (false) {
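// disabled branch, kept for reference: the JAXP default parser; the Xerces reader below is used instead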
SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
sp.parse(new FileInputStream(wikipedia), parser);
} else {
XMLReader reader =
XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
reader.setContentHandler(parser);
reader.setErrorHandler(parser);
reader.parse(new InputSource(new FileInputStream(wikipedia)));
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
public static void main(String[] args) {
if (args.length != 2) {
printUsage();
return;
}
File wikipedia = new File(args[0]);
if (wikipedia.exists()) {
File outputDir = new File(args[1]);
outputDir.mkdirs();
ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
extractor.extract();
} else {
printUsage();
}
}
private static void printUsage() {
System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
}
}