mirror of https://github.com/apache/lucene.git
LUCENE-848. Add Wikipedia benchmarking support
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@552229 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9ff9bf8142
commit
bc7c586468
|
@ -0,0 +1,2 @@
|
||||||
|
- /work
|
||||||
|
- /temp
|
|
@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
||||||
|
|
||||||
$Id:$
|
$Id:$
|
||||||
|
|
||||||
|
6/30/07
|
||||||
|
LUCENE-848: Added support for Wikipedia benchmarking.
|
||||||
|
|
||||||
6/25/07
|
6/25/07
|
||||||
- LUCENE-940: Multi-threaded issues fixed: SimpleDateFormat; logging for addDoc/deleteDoc tasks.
|
- LUCENE-940: Multi-threaded issues fixed: SimpleDateFormat; logging for addDoc/deleteDoc tasks.
|
||||||
- LUCENE-945: tests fail to find data dirs. Added sys-prop benchmark.work.dir and cfg-prop work.dir.
|
- LUCENE-945: tests fail to find data dirs. Added sys-prop benchmark.work.dir and cfg-prop work.dir.
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
Support exists for downloading, parsing, and loading the English
|
||||||
|
version of wikipedia (enwiki).
|
||||||
|
|
||||||
|
The build file can automatically try to download the most current
|
||||||
|
enwiki dataset (pages-articles.xml.bz2) from the "latest" directory,
|
||||||
|
http://download.wikimedia.org/enwiki/latest/. However, this file
|
||||||
|
doesn't always exist, depending on where wikipedia is in the dump
|
||||||
|
process and whether prior dumps have succeeded. If this file doesn't
|
||||||
|
exist, you can sometimes find an older or in progress version by
|
||||||
|
looking in the dated directories under
|
||||||
|
http://download.wikimedia.org/enwiki/. For example, as of this
|
||||||
|
writing, there is a page file in
|
||||||
|
http://download.wikimedia.org/enwiki/20070402/. You can download this
|
||||||
|
file manually and put it in temp. Note that the file you download will
|
||||||
|
probably have the date in the name, e.g.,
|
||||||
|
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
|
||||||
|
you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.
|
||||||
|
|
||||||
|
After that, ant enwiki should process the data set and run a load
|
||||||
|
test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
|
||||||
|
also be used to download, decompress, and extract (to individual files
|
||||||
|
in work/enwiki) the dataset, respectively.
|
|
@ -1,22 +1,4 @@
|
||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
|
|
||||||
<!--
|
|
||||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
contributor license agreements. See the NOTICE file distributed with
|
|
||||||
this work for additional information regarding copyright ownership.
|
|
||||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
the "License"); you may not use this file except in compliance with
|
|
||||||
the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
-->
|
|
||||||
|
|
||||||
<project name="benchmark" default="default">
|
<project name="benchmark" default="default">
|
||||||
|
|
||||||
<description>
|
<description>
|
||||||
|
@ -39,6 +21,34 @@
|
||||||
<available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
|
<available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
|
||||||
<available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
|
<available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
|
||||||
|
|
||||||
|
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
|
||||||
|
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
|
||||||
|
<available file="${working.dir}/enwiki" property="enwiki.extracted"/>
|
||||||
|
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="enwiki-files" depends="check-files">
|
||||||
|
<mkdir dir="temp"/>
|
||||||
|
<antcall target="get-enwiki"/>
|
||||||
|
<antcall target="expand-enwiki"/>
|
||||||
|
<antcall target="extract-enwiki"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="get-enwiki" unless="enwiki.exists">
|
||||||
|
<get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
|
||||||
|
dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="expand-enwiki" unless="enwiki.expanded">
|
||||||
|
<bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
|
||||||
|
<mkdir dir="${working.dir}/enwiki"/>
|
||||||
|
<java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
|
||||||
|
<classpath refid="run.classpath"/>
|
||||||
|
<arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
|
||||||
|
</java>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="get-news-20" unless="20news-18828.exists">
|
<target name="get-news-20" unless="20news-18828.exists">
|
||||||
|
@ -102,6 +112,8 @@
|
||||||
<property name="collections.jar" value="commons-collections-3.1.jar"/>
|
<property name="collections.jar" value="commons-collections-3.1.jar"/>
|
||||||
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
|
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
|
||||||
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
|
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
|
||||||
|
<property name="xercesImpl.jar" value="xerces-2.9.0.jar"/>
|
||||||
|
<property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
|
||||||
|
|
||||||
<path id="classpath">
|
<path id="classpath">
|
||||||
<pathelement path="${common.dir}/build/classes/java"/>
|
<pathelement path="${common.dir}/build/classes/java"/>
|
||||||
|
@ -110,6 +122,8 @@
|
||||||
<pathelement path="${basedir}/lib/${collections.jar}"/>
|
<pathelement path="${basedir}/lib/${collections.jar}"/>
|
||||||
<pathelement path="${basedir}/lib/${logging.jar}"/>
|
<pathelement path="${basedir}/lib/${logging.jar}"/>
|
||||||
<pathelement path="${basedir}/lib/${bean-utils.jar}"/>
|
<pathelement path="${basedir}/lib/${bean-utils.jar}"/>
|
||||||
|
<pathelement path="${basedir}/lib/${xercesImpl.jar}"/>
|
||||||
|
<pathelement path="${basedir}/lib/${xml-apis.jar}"/>
|
||||||
</path>
|
</path>
|
||||||
<path id="run.classpath">
|
<path id="run.classpath">
|
||||||
<path refid="classpath"/>
|
<path refid="classpath"/>
|
||||||
|
@ -143,13 +157,24 @@
|
||||||
</java>
|
</java>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<target name="enwiki" depends="compile,check-files,enwiki-files">
|
||||||
|
<echo>Working Directory: ${working.dir}</echo>
|
||||||
|
<java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="1024M" fork="true">
|
||||||
|
<assertions>
|
||||||
|
<enable/>
|
||||||
|
</assertions>
|
||||||
|
<classpath refid="run.classpath"/>
|
||||||
|
<arg line="conf/wikipedia.alg"/>
|
||||||
|
</java>
|
||||||
|
</target>
|
||||||
|
|
||||||
<target name="compile-demo">
|
<target name="compile-demo">
|
||||||
<subant target="compile-demo">
|
<subant target="compile-demo">
|
||||||
<fileset dir="${common.dir}" includes="build.xml"/>
|
<fileset dir="${common.dir}" includes="build.xml"/>
|
||||||
</subant>
|
</subant>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="init" depends="contrib-build.init,compile-demo,check-files"/>
|
<target name="init" depends="common.init,compile-demo,check-files"/>
|
||||||
|
|
||||||
<!-- make sure online collections (reuters) are first downloaded -->
|
<!-- make sure online collections (reuters) are first downloaded -->
|
||||||
<target name="test" depends="init,get-files">
|
<target name="test" depends="init,get-files">
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
#/**
|
||||||
|
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# * contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# * this work for additional information regarding copyright ownership.
|
||||||
|
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# * (the "License"); you may not use this file except in compliance with
|
||||||
|
# * the License. You may obtain a copy of the License at
|
||||||
|
# *
|
||||||
|
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# *
|
||||||
|
# * Unless required by applicable law or agreed to in writing, software
|
||||||
|
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# * See the License for the specific language governing permissions and
|
||||||
|
# * limitations under the License.
|
||||||
|
# */
|
||||||
|
# -------------------------------------------------------------------------------------
|
||||||
|
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
||||||
|
#
|
||||||
|
# based on micro-standard
|
||||||
|
#
|
||||||
|
# modified to use wikipedia sources and index entire docs
|
||||||
|
# currently just used to measure ingest rate
|
||||||
|
|
||||||
|
merge.factor=mrg:10:100:10:100
|
||||||
|
max.field.length=2147483647
|
||||||
|
max.buffered=buf:10:10:100:100
|
||||||
|
compound=true
|
||||||
|
|
||||||
|
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
||||||
|
directory=FSDirectory
|
||||||
|
|
||||||
|
doc.stored=true
|
||||||
|
doc.tokenized=true
|
||||||
|
doc.term.vector=false
|
||||||
|
doc.add.log.step=500
|
||||||
|
|
||||||
|
docs.dir=enwiki
|
||||||
|
|
||||||
|
doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker
|
||||||
|
|
||||||
|
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
||||||
|
|
||||||
|
# task at this depth or less would print when they start
|
||||||
|
task.max.depth.log=2
|
||||||
|
|
||||||
|
log.queries=false
|
||||||
|
# -------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
{ "Rounds"
|
||||||
|
|
||||||
|
ResetSystemErase
|
||||||
|
|
||||||
|
{ "Populate"
|
||||||
|
CreateIndex
|
||||||
|
{ "MAddDocs" AddDoc > : 200000
|
||||||
|
CloseIndex
|
||||||
|
}
|
||||||
|
|
||||||
|
NewRound
|
||||||
|
|
||||||
|
} : 8
|
||||||
|
|
||||||
|
RepSumByName
|
||||||
|
RepSumByPrefRound MAddDocs
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[99ee39d5be4f9700474691d8a5ed0a5058e27f7b] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[d42c0ea6cfd17ed6b444b8337febbc0bdb55ed83] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,210 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileFilter;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.text.DateFormat;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Stack;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A DocMaker using the Dir collection for its input.
|
||||||
|
*
|
||||||
|
* Config properties:
|
||||||
|
* docs.dir=<path to the docs dir| Default: dir-out>
|
||||||
|
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class DirDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
|
private DateFormat dateFormat;
|
||||||
|
private File dataDir = null;
|
||||||
|
private int iteration=0;
|
||||||
|
|
||||||
|
static public class Iterator implements java.util.Iterator {
|
||||||
|
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
public int getCount(){
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
Stack stack = new Stack();
|
||||||
|
|
||||||
|
/* this seems silly ... there must be a better way ...
|
||||||
|
not that this is good, but can it matter? */
|
||||||
|
|
||||||
|
static class Comparator implements java.util.Comparator {
|
||||||
|
public int compare(Object _a, Object _b) {
|
||||||
|
String a = _a.toString();
|
||||||
|
String b = _b.toString();
|
||||||
|
|
||||||
|
int diff = a.length() - b.length();
|
||||||
|
|
||||||
|
if (diff > 0) {
|
||||||
|
while (diff-- > 0) {
|
||||||
|
b = "0" + b;
|
||||||
|
}
|
||||||
|
} else if (diff < 0) {
|
||||||
|
diff = -diff;
|
||||||
|
while (diff-- > 0) {
|
||||||
|
a = "0" + a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* note it's reversed because we're going to push,
|
||||||
|
which reverses again */
|
||||||
|
return b.compareTo(a);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Comparator c = new Comparator();
|
||||||
|
|
||||||
|
void push(File[] files) {
|
||||||
|
Arrays.sort(files, c);
|
||||||
|
for(int i = 0; i < files.length; i++) {
|
||||||
|
// System.err.println("push " + files[i]);
|
||||||
|
stack.push(files[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void push(File f) {
|
||||||
|
push(f.listFiles(new FileFilter() {
|
||||||
|
public boolean accept(File f) { return f.isDirectory(); } }));
|
||||||
|
push(f.listFiles(new FileFilter() {
|
||||||
|
public boolean accept(File f) { return f.getName().endsWith(".txt"); } }));
|
||||||
|
find();
|
||||||
|
}
|
||||||
|
|
||||||
|
void find() {
|
||||||
|
if (stack.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!((File)stack.peek()).isDirectory()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
File f = (File)stack.pop();
|
||||||
|
push(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator(File f) {
|
||||||
|
push(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new RuntimeException("cannot");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
return stack.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object next() {
|
||||||
|
assert hasNext();
|
||||||
|
count++;
|
||||||
|
Object object = stack.pop();
|
||||||
|
// System.err.println("pop " + object);
|
||||||
|
find();
|
||||||
|
return object;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private Iterator inputFiles = null;
|
||||||
|
|
||||||
|
/* (non-Javadoc)
|
||||||
|
* @see SimpleDocMaker#setConfig(java.util.Properties)
|
||||||
|
*/
|
||||||
|
public void setConfig(Config config) {
|
||||||
|
super.setConfig(config);
|
||||||
|
String d = config.get("docs.dir", "dir-out");
|
||||||
|
dataDir = new File(new File("work"), d);
|
||||||
|
|
||||||
|
inputFiles = new Iterator(dataDir);
|
||||||
|
|
||||||
|
if (inputFiles==null) {
|
||||||
|
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
|
||||||
|
}
|
||||||
|
// date format: 30-MAR-1987 14:22:36
|
||||||
|
dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US);
|
||||||
|
dateFormat.setLenient(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected DocData getNextDocData() throws Exception {
|
||||||
|
File f = null;
|
||||||
|
String name = null;
|
||||||
|
synchronized (this) {
|
||||||
|
if (!inputFiles.hasNext()) {
|
||||||
|
// exhausted files, start a new round, unless forever set to false.
|
||||||
|
if (!forever) {
|
||||||
|
throw new NoMoreDataException();
|
||||||
|
}
|
||||||
|
inputFiles = new Iterator(dataDir);
|
||||||
|
iteration++;
|
||||||
|
}
|
||||||
|
f = (File) inputFiles.next();
|
||||||
|
// System.err.println(f);
|
||||||
|
name = f.getCanonicalPath()+"_"+iteration;
|
||||||
|
}
|
||||||
|
|
||||||
|
BufferedReader reader = new BufferedReader(new FileReader(f));
|
||||||
|
String line = null;
|
||||||
|
//First line is the date, 3rd is the title, rest is body
|
||||||
|
String dateStr = reader.readLine();
|
||||||
|
reader.readLine();//skip an empty line
|
||||||
|
String title = reader.readLine();
|
||||||
|
reader.readLine();//skip an empty line
|
||||||
|
StringBuffer bodyBuf = new StringBuffer(1024);
|
||||||
|
while ((line = reader.readLine()) != null) {
|
||||||
|
bodyBuf.append(line).append(' ');
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
addBytes(f.length());
|
||||||
|
|
||||||
|
Date date = dateFormat.parse(dateStr.trim());
|
||||||
|
return new DocData(name, bodyBuf.toString(), title, null, date);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (non-Javadoc)
|
||||||
|
* @see DocMaker#resetIinputs()
|
||||||
|
*/
|
||||||
|
public synchronized void resetInputs() {
|
||||||
|
super.resetInputs();
|
||||||
|
inputFiles = new Iterator(dataDir);
|
||||||
|
iteration = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (non-Javadoc)
|
||||||
|
* @see DocMaker#numUniqueTexts()
|
||||||
|
*/
|
||||||
|
public int numUniqueTexts() {
|
||||||
|
return inputFiles.getCount();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -17,19 +17,20 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.store.Directory;
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an index.
|
* Create an index.
|
||||||
* <br>Other side effects: index writer object in perfRunData is set.
|
* <br>Other side effects: index writer object in perfRunData is set.
|
||||||
* <br>Relevant properties: <code>merge.factor , max.buffered</code>.
|
* <br>Relevant properties: <code>merge.factor, max.buffered,
|
||||||
|
* max.field.length</code>.
|
||||||
*/
|
*/
|
||||||
public class CreateIndexTask extends PerfTask {
|
public class CreateIndexTask extends PerfTask {
|
||||||
|
|
||||||
|
@ -48,10 +49,12 @@ public class CreateIndexTask extends PerfTask {
|
||||||
boolean cmpnd = config.get("compound",true);
|
boolean cmpnd = config.get("compound",true);
|
||||||
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
|
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
|
||||||
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
|
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
|
||||||
|
int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
|
||||||
|
|
||||||
iw.setUseCompoundFile(cmpnd);
|
iw.setUseCompoundFile(cmpnd);
|
||||||
iw.setMergeFactor(mrgf);
|
iw.setMergeFactor(mrgf);
|
||||||
iw.setMaxBufferedDocs(mxbf);
|
iw.setMaxBufferedDocs(mxbf);
|
||||||
|
iw.setMaxFieldLength(mxfl);
|
||||||
|
|
||||||
getRunData().setIndexWriter(iw);
|
getRunData().setIndexWriter(iw);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -17,23 +17,26 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.store.Directory;
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Open an index writer.
|
* Open an index writer.
|
||||||
* <br>Other side effects: index writer object in perfRunData is set.
|
* <br>Other side effects: index writer object in perfRunData is set.
|
||||||
* <br>Relevant properties: <code>merge.factor , max.buffered</code>.
|
* <br>Relevant properties: <code>merge.factor, max.buffered,
|
||||||
|
* max.field.length</code>.
|
||||||
|
</code>.
|
||||||
*/
|
*/
|
||||||
public class OpenIndexTask extends PerfTask {
|
public class OpenIndexTask extends PerfTask {
|
||||||
|
|
||||||
public static final int DEFAULT_MAX_BUFFERED = 10;
|
public static final int DEFAULT_MAX_BUFFERED = 10;
|
||||||
|
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
|
||||||
public static final int DEFAULT_MERGE_PFACTOR = 10;
|
public static final int DEFAULT_MERGE_PFACTOR = 10;
|
||||||
|
|
||||||
public OpenIndexTask(PerfRunData runData) {
|
public OpenIndexTask(PerfRunData runData) {
|
||||||
|
@ -50,9 +53,11 @@ public class OpenIndexTask extends PerfTask {
|
||||||
boolean cmpnd = config.get("compound",true);
|
boolean cmpnd = config.get("compound",true);
|
||||||
int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
|
int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
|
||||||
int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
|
int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
|
||||||
|
int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
|
||||||
|
|
||||||
// must update params for newly opened writer
|
// must update params for newly opened writer
|
||||||
writer.setMaxBufferedDocs(mxbf);
|
writer.setMaxBufferedDocs(mxbf);
|
||||||
|
writer.setMaxFieldLength(mxfl);
|
||||||
writer.setMergeFactor(mrgf);
|
writer.setMergeFactor(mrgf);
|
||||||
writer.setUseCompoundFile(cmpnd); // this one redundant?
|
writer.setUseCompoundFile(cmpnd); // this one redundant?
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,211 @@
|
||||||
|
package org.apache.lucene.benchmark.utils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.xml.sax.Attributes;
|
||||||
|
import org.xml.sax.InputSource;
|
||||||
|
import org.xml.sax.XMLReader;
|
||||||
|
import org.xml.sax.helpers.DefaultHandler;
|
||||||
|
import org.xml.sax.helpers.XMLReaderFactory;
|
||||||
|
|
||||||
|
import javax.xml.parsers.SAXParser;
|
||||||
|
import javax.xml.parsers.SAXParserFactory;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract the downloaded Wikipedia dump into separate files for indexing.
|
||||||
|
*/
|
||||||
|
public class ExtractWikipedia {
|
||||||
|
|
||||||
|
private File wikipedia;
|
||||||
|
private File outputDir;
|
||||||
|
|
||||||
|
public ExtractWikipedia(File wikipedia, File outputDir) {
|
||||||
|
this.wikipedia = wikipedia;
|
||||||
|
this.outputDir = outputDir;
|
||||||
|
System.out.println("Deleting all files in " + outputDir);
|
||||||
|
File [] files = outputDir.listFiles();
|
||||||
|
for (int i = 0; i < files.length; i++) {
|
||||||
|
files[i].delete();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static public int count = 0;
|
||||||
|
static String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||||
|
"MAY", "JUN", "JUL", "AUG",
|
||||||
|
"SEP", "OCT", "NOV", "DEC"};
|
||||||
|
|
||||||
|
public class Parser extends DefaultHandler {
|
||||||
|
|
||||||
|
public Parser() {
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuffer contents = new StringBuffer();
|
||||||
|
|
||||||
|
public void characters(char[] ch, int start, int length) {
|
||||||
|
contents.append(ch, start, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
String title;
|
||||||
|
String id;
|
||||||
|
String body;
|
||||||
|
String time;
|
||||||
|
|
||||||
|
static final int BASE = 10;
|
||||||
|
|
||||||
|
public void startElement(String namespace,
|
||||||
|
String simple,
|
||||||
|
String qualified,
|
||||||
|
Attributes attributes) {
|
||||||
|
if (qualified.equals("page")) {
|
||||||
|
title = null;
|
||||||
|
id = null;
|
||||||
|
body = null;
|
||||||
|
time = null;
|
||||||
|
} else if (qualified.equals("text")) {
|
||||||
|
contents.setLength(0);
|
||||||
|
} else if (qualified.equals("timestamp")) {
|
||||||
|
contents.setLength(0);
|
||||||
|
} else if (qualified.equals("title")) {
|
||||||
|
contents.setLength(0);
|
||||||
|
} else if (qualified.equals("id")) {
|
||||||
|
contents.setLength(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public File directory (int count, File directory) {
|
||||||
|
if (directory == null) {
|
||||||
|
directory = outputDir;
|
||||||
|
}
|
||||||
|
int base = BASE;
|
||||||
|
while (base <= count) {
|
||||||
|
base *= BASE;
|
||||||
|
}
|
||||||
|
if (count < BASE) {
|
||||||
|
return directory;
|
||||||
|
}
|
||||||
|
directory = new File (directory, (Integer.toString(base / BASE)));
|
||||||
|
directory = new File (directory, (Integer.toString(count / (base / BASE))));
|
||||||
|
return directory(count % (base / BASE), directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void create(String id, String title, String time, String body) {
|
||||||
|
|
||||||
|
File d = directory(count++, null);
|
||||||
|
d.mkdirs();
|
||||||
|
File f = new File(d, id + ".txt");
|
||||||
|
|
||||||
|
StringBuffer contents = new StringBuffer();
|
||||||
|
|
||||||
|
contents.append(time);
|
||||||
|
contents.append("\n\n");
|
||||||
|
contents.append(title);
|
||||||
|
contents.append("\n\n");
|
||||||
|
contents.append(body);
|
||||||
|
contents.append("\n");
|
||||||
|
|
||||||
|
try {
|
||||||
|
FileWriter writer = new FileWriter(f);
|
||||||
|
writer.write(contents.toString());
|
||||||
|
writer.close();
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
throw new RuntimeException(ioe);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
String time(String original) {
|
||||||
|
StringBuffer buffer = new StringBuffer();
|
||||||
|
|
||||||
|
buffer.append(original.substring(8, 10));
|
||||||
|
buffer.append('-');
|
||||||
|
buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
|
||||||
|
buffer.append('-');
|
||||||
|
buffer.append(original.substring(0, 4));
|
||||||
|
buffer.append(' ');
|
||||||
|
buffer.append(original.substring(11, 19));
|
||||||
|
buffer.append(".000");
|
||||||
|
|
||||||
|
return buffer.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void endElement(String namespace, String simple, String qualified) {
|
||||||
|
if (qualified.equals("title")) {
|
||||||
|
title = contents.toString();
|
||||||
|
} else if (qualified.equals("text")) {
|
||||||
|
body = contents.toString();
|
||||||
|
if (body.startsWith("#REDIRECT") ||
|
||||||
|
body.startsWith("#redirect")) {
|
||||||
|
body = null;
|
||||||
|
}
|
||||||
|
} else if (qualified.equals("timestamp")) {
|
||||||
|
time = time(contents.toString());
|
||||||
|
} else if (qualified.equals("id") && id == null) {
|
||||||
|
id = contents.toString();
|
||||||
|
} else if (qualified.equals("page")) {
|
||||||
|
if (body != null) {
|
||||||
|
create(id, title, time, body);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void extract() {
|
||||||
|
|
||||||
|
try {
|
||||||
|
Parser parser = new Parser();
|
||||||
|
if (false) {
|
||||||
|
SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
|
||||||
|
sp.parse(new FileInputStream(wikipedia), parser);
|
||||||
|
} else {
|
||||||
|
XMLReader reader =
|
||||||
|
XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
|
||||||
|
reader.setContentHandler(parser);
|
||||||
|
reader.setErrorHandler(parser);
|
||||||
|
reader.parse(new InputSource(new FileInputStream(wikipedia)));
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
if (args.length != 2) {
|
||||||
|
printUsage();
|
||||||
|
}
|
||||||
|
|
||||||
|
File wikipedia = new File(args[0]);
|
||||||
|
|
||||||
|
if (wikipedia.exists()) {
|
||||||
|
File outputDir = new File(args[1]);
|
||||||
|
outputDir.mkdirs();
|
||||||
|
ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
|
||||||
|
extractor.extract();
|
||||||
|
} else {
|
||||||
|
printUsage();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void printUsage() {
|
||||||
|
System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue