mirror of https://github.com/apache/lucene.git
LUCENE-848. Add Wikipedia benchmarking support
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@552229 13f79535-47bb-0310-9956-ffa450edef68
parent 9ff9bf8142
commit bc7c586468
@@ -0,0 +1,2 @@
/work
/temp
@@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

$Id:$

6/30/07
- LUCENE-848: Added support for Wikipedia benchmarking.

6/25/07
- LUCENE-940: Multi-threaded issues fixed: SimpleDateFormat; logging for addDoc/deleteDoc tasks.
- LUCENE-945: tests fail to find data dirs. Added sys-prop benchmark.work.dir and cfg-prop work.dir.
@@ -0,0 +1,22 @@
Support exists for downloading, parsing, and loading the English
version of Wikipedia (enwiki).

The build file can automatically try to download the most current
enwiki dataset (pages-articles.xml.bz2) from the "latest" directory,
http://download.wikimedia.org/enwiki/latest/. However, this file
doesn't always exist, depending on where Wikipedia is in the dump
process and whether prior dumps have succeeded. If this file doesn't
exist, you can sometimes find an older or in-progress version by
looking in the dated directories under
http://download.wikimedia.org/enwiki/. For example, as of this
writing, there is a page file in
http://download.wikimedia.org/enwiki/20070402/. You can download this
file manually and put it in temp. Note that the file you download will
probably have the date in the name, e.g.,
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.

After that, "ant enwiki" should process the data set and run a load
test. The Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
also be used to download, decompress, and extract (to individual files
in work/enwiki) the dataset, respectively.
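The rename is the one manual step in that path; a minimal sketch of it in Java (not part of the commit; the dated name is the example from the paragraph above, and the file is assumed to sit in the benchmark's temp directory):

    import java.io.File;

    public class RenameDump {
      public static void main(String[] args) {
        // dated name as downloaded (example date from this readme)
        File downloaded = new File("temp", "enwiki-20070402-pages-articles.xml.bz2");
        // name the benchmark build expects
        File expected = new File("temp", "enwiki-latest-pages-articles.xml.bz2");
        if (!downloaded.renameTo(expected)) {
          System.err.println("rename failed for " + downloaded.getAbsolutePath());
        }
      }
    }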
@@ -1,22 +1,4 @@
<?xml version="1.0"?>

<!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 -->

<project name="benchmark" default="default">

    <description>
@@ -39,6 +21,34 @@
     <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
     <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>

+    <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
+    <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
+    <available file="${working.dir}/enwiki" property="enwiki.extracted"/>
+
   </target>

+  <target name="enwiki-files" depends="check-files">
+    <mkdir dir="temp"/>
+    <antcall target="get-enwiki"/>
+    <antcall target="expand-enwiki"/>
+    <antcall target="extract-enwiki"/>
+  </target>
+
+  <target name="get-enwiki" unless="enwiki.exists">
+    <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
+         dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
+  </target>
+
+  <target name="expand-enwiki" unless="enwiki.expanded">
+    <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
+  </target>
+
+  <target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
+    <mkdir dir="${working.dir}/enwiki"/>
+    <java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
+      <classpath refid="run.classpath"/>
+      <arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
+    </java>
+  </target>
+
   <target name="get-news-20" unless="20news-18828.exists">
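For reference, the extract-enwiki target's <java> task above amounts to a direct call to the extractor's main(); a minimal sketch (not part of the commit; paths are copied from the target, with ${working.dir} assumed to be the default work directory):

    public class ExtractEnwikiDriver {
      public static void main(String[] args) {
        // same arguments the <arg line="..."/> above passes to the forked JVM
        org.apache.lucene.benchmark.utils.ExtractWikipedia.main(new String[] {
            "temp/enwiki-20070527-pages-articles.xml",  // expanded dump
            "work/enwiki"                               // ${working.dir}/enwiki
        });
      }
    }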
@@ -102,6 +112,8 @@
   <property name="collections.jar" value="commons-collections-3.1.jar"/>
   <property name="logging.jar" value="commons-logging-1.0.4.jar"/>
   <property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
+  <property name="xercesImpl.jar" value="xerces-2.9.0.jar"/>
+  <property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>

   <path id="classpath">
     <pathelement path="${common.dir}/build/classes/java"/>
@@ -110,6 +122,8 @@
     <pathelement path="${basedir}/lib/${collections.jar}"/>
     <pathelement path="${basedir}/lib/${logging.jar}"/>
     <pathelement path="${basedir}/lib/${bean-utils.jar}"/>
+    <pathelement path="${basedir}/lib/${xercesImpl.jar}"/>
+    <pathelement path="${basedir}/lib/${xml-apis.jar}"/>
   </path>
   <path id="run.classpath">
     <path refid="classpath"/>
@@ -143,13 +157,24 @@
     </java>
   </target>

+  <target name="enwiki" depends="compile,check-files,enwiki-files">
+    <echo>Working Directory: ${working.dir}</echo>
+    <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="1024M" fork="true">
+      <assertions>
+        <enable/>
+      </assertions>
+      <classpath refid="run.classpath"/>
+      <arg line="conf/wikipedia.alg"/>
+    </java>
+  </target>
+
   <target name="compile-demo">
     <subant target="compile-demo">
       <fileset dir="${common.dir}" includes="build.xml"/>
     </subant>
   </target>

-  <target name="init" depends="contrib-build.init,compile-demo,check-files"/>
+  <target name="init" depends="common.init,compile-demo,check-files"/>

   <!-- make sure online collections (reuters) are first downloaded -->
   <target name="test" depends="init,get-files">
@@ -0,0 +1,65 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements.  See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License.  You may obtain a copy of the License at
# *
# *     http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# Multi-valued params are iterated by NewRound tasks, added to reports, and start with a column name.
#
# Based on micro-standard.
#
# Modified to use Wikipedia sources and index entire docs;
# currently just used to measure ingest rate.

merge.factor=mrg:10:100:10:100
max.field.length=2147483647
max.buffered=buf:10:10:100:100
compound=true

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory

doc.stored=true
doc.tokenized=true
doc.term.vector=false
doc.add.log.step=500

docs.dir=enwiki

doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker

query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

# Tasks at this depth or less print when they start.
task.max.depth.log=2

log.queries=false
# -------------------------------------------------------------------------------------

{ "Rounds"

    ResetSystemErase

    { "Populate"
        CreateIndex
        { "MAddDocs" AddDoc > : 200000
        CloseIndex
    }

    NewRound

} : 8

RepSumByName
RepSumByPrefRound MAddDocs
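In the task section above, { ... } groups a task sequence, a quoted string names it for reporting, and a trailing ": N" repeats it N times; the "enwiki" Ant target simply hands this file to the benchmark driver. A minimal programmatic equivalent (not part of the commit; run from the benchmark directory):

    public class RunWikipediaAlg {
      public static void main(String[] args) throws Exception {
        // Benchmark.main reads the .alg file and executes its task sequence,
        // here running the "Rounds" sequence 8 times (via "} : 8").
        org.apache.lucene.benchmark.byTask.Benchmark.main(
            new String[] { "conf/wikipedia.alg" });
      }
    }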
@@ -0,0 +1,2 @@
AnyObjectId[99ee39d5be4f9700474691d8a5ed0a5058e27f7b] was removed in git history.
Apache SVN contains full history.
@@ -0,0 +1,2 @@
AnyObjectId[d42c0ea6cfd17ed6b444b8337febbc0bdb55ed83] was removed in git history.
Apache SVN contains full history.
@@ -0,0 +1,210 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.utils.Config;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Stack;

/**
 * A DocMaker using the Dir collection for its input.
 *
 * Config properties:
 * docs.dir=&lt;path to the docs dir&gt; (default: dir-out)
 */
public class DirDocMaker extends BasicDocMaker {

  private DateFormat dateFormat;
  private File dataDir = null;
  private int iteration = 0;

  static public class Iterator implements java.util.Iterator {

    int count = 0;

    public int getCount() {
      return count;
    }

    Stack stack = new Stack();

    /* this seems silly ... there must be a better way ...
       not that this is good, but can it matter? */

    static class Comparator implements java.util.Comparator {
      public int compare(Object _a, Object _b) {
        String a = _a.toString();
        String b = _b.toString();

        int diff = a.length() - b.length();

        if (diff > 0) {
          while (diff-- > 0) {
            b = "0" + b;
          }
        } else if (diff < 0) {
          diff = -diff;
          while (diff-- > 0) {
            a = "0" + a;
          }
        }

        /* note it's reversed because we're going to push,
           which reverses again */
        return b.compareTo(a);
      }
    }

    Comparator c = new Comparator();

    void push(File[] files) {
      Arrays.sort(files, c);
      for (int i = 0; i < files.length; i++) {
        // System.err.println("push " + files[i]);
        stack.push(files[i]);
      }
    }

    void push(File f) {
      // push subdirectories first, then the .txt files, so files pop first
      push(f.listFiles(new FileFilter() {
        public boolean accept(File f) { return f.isDirectory(); } }));
      push(f.listFiles(new FileFilter() {
        public boolean accept(File f) { return f.getName().endsWith(".txt"); } }));
      find();
    }

    void find() {
      // descend until a plain file sits on top of the stack
      if (stack.empty()) {
        return;
      }
      if (!((File) stack.peek()).isDirectory()) {
        return;
      }
      File f = (File) stack.pop();
      push(f);
    }

    public Iterator(File f) {
      push(f);
    }

    public void remove() {
      throw new RuntimeException("cannot");
    }

    public boolean hasNext() {
      return stack.size() > 0;
    }

    public Object next() {
      assert hasNext();
      count++;
      Object object = stack.pop();
      // System.err.println("pop " + object);
      find();
      return object;
    }

  }

  private Iterator inputFiles = null;

  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    String d = config.get("docs.dir", "dir-out");
    dataDir = new File(new File("work"), d);

    inputFiles = new Iterator(dataDir);

    // note: the constructor never returns null; this check is vestigial
    if (inputFiles == null) {
      throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36
    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss", Locale.US);
    dateFormat.setLenient(true);
  }

  protected DocData getNextDocData() throws Exception {
    File f = null;
    String name = null;
    synchronized (this) {
      if (!inputFiles.hasNext()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        inputFiles = new Iterator(dataDir);
        iteration++;
      }
      f = (File) inputFiles.next();
      // System.err.println(f);
      name = f.getCanonicalPath() + "_" + iteration;
    }

    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    // first line is the date, third is the title, the rest is the body
    String dateStr = reader.readLine();
    reader.readLine();  // skip an empty line
    String title = reader.readLine();
    reader.readLine();  // skip an empty line
    StringBuffer bodyBuf = new StringBuffer(1024);
    while ((line = reader.readLine()) != null) {
      bodyBuf.append(line).append(' ');
    }
    reader.close();
    addBytes(f.length());

    Date date = dateFormat.parse(dateStr.trim());
    return new DocData(name, bodyBuf.toString(), title, null, date);
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#resetInputs()
   */
  public synchronized void resetInputs() {
    super.resetInputs();
    inputFiles = new Iterator(dataDir);
    iteration = 0;
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#numUniqueTexts()
   */
  public int numUniqueTexts() {
    return inputFiles.getCount();
  }

}
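One non-obvious detail above: Iterator.Comparator compares numeric file names by left-padding the shorter name with '0' characters, and returns the reversed comparison on purpose, since the sorted batch is pushed onto a stack and popping reverses it back. A standalone sketch of the effect (not part of the commit):

    import java.util.Arrays;
    import java.util.Comparator;

    public class PaddingCompareDemo {

      // Same padding trick as DirDocMaker.Iterator.Comparator above.
      static final Comparator BY_PADDED_NAME_REVERSED = new Comparator() {
        public int compare(Object _a, Object _b) {
          String a = _a.toString();
          String b = _b.toString();
          int diff = a.length() - b.length();
          while (diff > 0) { b = "0" + b; diff--; }  // pad b up to a's length
          while (diff < 0) { a = "0" + a; diff++; }  // pad a up to b's length
          return b.compareTo(a);                     // reversed on purpose
        }
      };

      public static void main(String[] args) {
        String[] names = { "10", "9", "100", "2" };
        Arrays.sort(names, BY_PADDED_NAME_REVERSED);
        // Numerically descending: [100, 10, 9, 2]; pushed onto a stack and
        // popped, the files come back in ascending order: 2, 9, 10, 100.
        System.out.println(Arrays.toString(names));
      }
    }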
@@ -17,19 +17,20 @@ package org.apache.lucene.benchmark.byTask.tasks;
  * limitations under the License.
  */

-import java.io.IOException;
-
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.Directory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;


 /**
  * Create an index.
  * <br>Other side effects: index writer object in perfRunData is set.
- * <br>Relevant properties: <code>merge.factor , max.buffered</code>.
+ * <br>Relevant properties: <code>merge.factor, max.buffered,
+ * max.field.length</code>.
  */
 public class CreateIndexTask extends PerfTask {

@@ -48,10 +49,12 @@ public class CreateIndexTask extends PerfTask {
     boolean cmpnd = config.get("compound",true);
     int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
+    int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);

     iw.setUseCompoundFile(cmpnd);
     iw.setMergeFactor(mrgf);
     iw.setMaxBufferedDocs(mxbf);
+    iw.setMaxFieldLength(mxfl);

     getRunData().setIndexWriter(iw);
     return 1;
@@ -17,23 +17,26 @@ package org.apache.lucene.benchmark.byTask.tasks;
  * limitations under the License.
  */

-import java.io.IOException;
-
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.Directory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;


 /**
  * Open an index writer.
  * <br>Other side effects: index writer object in perfRunData is set.
- * <br>Relevant properties: <code>merge.factor , max.buffered</code>.
+ * <br>Relevant properties: <code>merge.factor, max.buffered,
+ * max.field.length</code>.
  */
 public class OpenIndexTask extends PerfTask {

   public static final int DEFAULT_MAX_BUFFERED = 10;
+  public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
   public static final int DEFAULT_MERGE_PFACTOR = 10;

   public OpenIndexTask(PerfRunData runData) {
@@ -50,9 +53,11 @@ public class OpenIndexTask extends PerfTask {
     boolean cmpnd = config.get("compound",true);
     int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
+    int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);

     // must update params for newly opened writer
     writer.setMaxBufferedDocs(mxbf);
+    writer.setMaxFieldLength(mxfl);
     writer.setMergeFactor(mrgf);
     writer.setUseCompoundFile(cmpnd); // this one redundant?

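Both tasks route these properties into the same Lucene 2.x IndexWriter setters; a rough standalone sketch of the equivalent direct calls (not part of the commit; the index path is hypothetical and the values are the defaults above):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;

    public class WriterSetupDemo {
      public static void main(String[] args) throws Exception {
        IndexWriter iw = new IndexWriter("work/demo-index", new StandardAnalyzer(), true);
        iw.setUseCompoundFile(true);   // compound=true
        iw.setMergeFactor(10);         // merge.factor, DEFAULT_MERGE_PFACTOR
        iw.setMaxBufferedDocs(10);     // max.buffered, DEFAULT_MAX_BUFFERED
        iw.setMaxFieldLength(10000);   // max.field.length, DEFAULT_MAX_FIELD_LENGTH
        iw.close();
      }
    }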
@@ -0,0 +1,211 @@
package org.apache.lucene.benchmark.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Extract the downloaded Wikipedia dump into separate files for indexing.
 */
public class ExtractWikipedia {

  private File wikipedia;
  private File outputDir;

  public ExtractWikipedia(File wikipedia, File outputDir) {
    this.wikipedia = wikipedia;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    File[] files = outputDir.listFiles();
    for (int i = 0; i < files.length; i++) {
      files[i].delete();
    }
  }

  static public int count = 0;
  static String[] months = {"JAN", "FEB", "MAR", "APR",
                            "MAY", "JUN", "JUL", "AUG",
                            "SEP", "OCT", "NOV", "DEC"};

  public class Parser extends DefaultHandler {

    public Parser() {
    }

    StringBuffer contents = new StringBuffer();

    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    String title;
    String id;
    String body;
    String time;

    static final int BASE = 10;

    public void startElement(String namespace,
                             String simple,
                             String qualified,
                             Attributes attributes) {
      if (qualified.equals("page")) {
        title = null;
        id = null;
        body = null;
        time = null;
      } else if (qualified.equals("text")) {
        contents.setLength(0);
      } else if (qualified.equals("timestamp")) {
        contents.setLength(0);
      } else if (qualified.equals("title")) {
        contents.setLength(0);
      } else if (qualified.equals("id")) {
        contents.setLength(0);
      }
    }

    // bucket documents into nested directories keyed by powers of BASE,
    // so no single directory collects too many files
    public File directory(int count, File directory) {
      if (directory == null) {
        directory = outputDir;
      }
      int base = BASE;
      while (base <= count) {
        base *= BASE;
      }
      if (count < BASE) {
        return directory;
      }
      directory = new File(directory, (Integer.toString(base / BASE)));
      directory = new File(directory, (Integer.toString(count / (base / BASE))));
      return directory(count % (base / BASE), directory);
    }

    public void create(String id, String title, String time, String body) {

      File d = directory(count++, null);
      d.mkdirs();
      File f = new File(d, id + ".txt");

      // layout matches what DirDocMaker.getNextDocData expects:
      // date, blank, title, blank, body
      StringBuffer contents = new StringBuffer();

      contents.append(time);
      contents.append("\n\n");
      contents.append(title);
      contents.append("\n\n");
      contents.append(body);
      contents.append("\n");

      try {
        FileWriter writer = new FileWriter(f);
        writer.write(contents.toString());
        writer.close();
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }

    }

    // convert the dump's ISO timestamp (e.g. 2007-05-27T12:34:56Z) into the
    // 27-MAY-2007 12:34:56.000 form that DirDocMaker's date format parses
    String time(String original) {
      StringBuffer buffer = new StringBuffer();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }

    public void endElement(String namespace, String simple, String qualified) {
      if (qualified.equals("title")) {
        title = contents.toString();
      } else if (qualified.equals("text")) {
        body = contents.toString();
        if (body.startsWith("#REDIRECT") ||
            body.startsWith("#redirect")) {
          body = null;
        }
      } else if (qualified.equals("timestamp")) {
        time = time(contents.toString());
      } else if (qualified.equals("id") && id == null) {
        // keep only the first <id> per page: the page id, not revision ids
        id = contents.toString();
      } else if (qualified.equals("page")) {
        if (body != null) {
          create(id, title, time, body);
        }
      }
    }
  }

  public void extract() {

    try {
      Parser parser = new Parser();
      // development toggle: JAXP default parser vs. an explicit Xerces reader
      if (false) {
        SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
        sp.parse(new FileInputStream(wikipedia), parser);
      } else {
        XMLReader reader =
          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        reader.setContentHandler(parser);
        reader.setErrorHandler(parser);
        reader.parse(new InputSource(new FileInputStream(wikipedia)));
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static void main(String[] args) {
    if (args.length != 2) {
      printUsage();
      return;
    }

    File wikipedia = new File(args[0]);

    if (wikipedia.exists()) {
      File outputDir = new File(args[1]);
      outputDir.mkdirs();
      ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
      extractor.extract();
    } else {
      printUsage();
    }
  }

  private static void printUsage() {
    System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
  }

}
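The directory(count, dir) method above spreads extracted pages over nested buckets keyed by successive powers of BASE (10). A standalone trace of the paths it produces (not part of the commit; strings stand in for File objects):

    public class BucketPathDemo {

      static final int BASE = 10;

      // Mirrors ExtractWikipedia.Parser#directory(int, File) above.
      static String directory(int count, String dir) {
        int base = BASE;
        while (base <= count) {
          base *= BASE;  // smallest power of BASE strictly above count
        }
        if (count < BASE) {
          return dir;    // few enough files left: stop nesting
        }
        dir = dir + "/" + (base / BASE) + "/" + (count / (base / BASE));
        return directory(count % (base / BASE), dir);
      }

      public static void main(String[] args) {
        System.out.println(directory(7, "out"));      // out
        System.out.println(directory(42, "out"));     // out/10/4
        System.out.println(directory(12345, "out"));  // out/10000/1/1000/2/100/3/10/4
      }
    }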