mirror of https://github.com/apache/lucene.git
LUCENE-848. Add Wikipedia benchmarking support
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@552229 13f79535-47bb-0310-9956-ffa450edef68
parent 9ff9bf8142
commit bc7c586468
@@ -0,0 +1,2 @@
/work
/temp
@@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

$Id:$

6/30/07
- LUCENE-848: Added support for Wikipedia benchmarking.

6/25/07
- LUCENE-940: Multi-threaded issues fixed: SimpleDateFormat; logging for addDoc/deleteDoc tasks.
- LUCENE-945: tests fail to find data dirs. Added sys-prop benchmark.work.dir and cfg-prop work.dir.
@@ -0,0 +1,22 @@
Support exists for downloading, parsing, and loading the English
version of Wikipedia (enwiki).

The build file can automatically try to download the most current
enwiki dataset (pages-articles.xml.bz2) from the "latest" directory,
http://download.wikimedia.org/enwiki/latest/. However, this file
doesn't always exist, depending on where Wikipedia is in the dump
process and whether prior dumps have succeeded. If this file doesn't
exist, you can sometimes find an older or in-progress version by
looking in the dated directories under
http://download.wikimedia.org/enwiki/. For example, as of this
writing, there is a page file in
http://download.wikimedia.org/enwiki/20070402/. You can download this
file manually and put it in temp. Note that the file you download will
probably have the date in the name, e.g.,
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.

After that, "ant enwiki" should process the data set and run a load
test. The Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
also be used to download, decompress, and extract (to individual files
in work/enwiki) the dataset, respectively.
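The rename is the one manual step in that path; a minimal sketch of it in Java (not part of the commit; the dated name is the example from the paragraph above, and the file is assumed to sit in the benchmark's temp directory):

    import java.io.File;

    public class RenameDump {
      public static void main(String[] args) {
        // dated name as downloaded (example date from this readme)
        File downloaded = new File("temp", "enwiki-20070402-pages-articles.xml.bz2");
        // name the benchmark build expects
        File expected = new File("temp", "enwiki-latest-pages-articles.xml.bz2");
        if (!downloaded.renameTo(expected)) {
          System.err.println("rename failed for " + downloaded.getAbsolutePath());
        }
      }
    }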
@@ -1,22 +1,4 @@
<?xml version="1.0"?>

<!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 -->

<project name="benchmark" default="default">

    <description>
@@ -39,6 +21,34 @@
     <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
     <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>

+    <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
+    <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
+    <available file="${working.dir}/enwiki" property="enwiki.extracted"/>
+
   </target>

+  <target name="enwiki-files" depends="check-files">
+    <mkdir dir="temp"/>
+    <antcall target="get-enwiki"/>
+    <antcall target="expand-enwiki"/>
+    <antcall target="extract-enwiki"/>
+  </target>
+
+  <target name="get-enwiki" unless="enwiki.exists">
+    <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
+         dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
+  </target>
+
+  <target name="expand-enwiki" unless="enwiki.expanded">
+    <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
+  </target>
+
+  <target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
+    <mkdir dir="${working.dir}/enwiki"/>
+    <java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
+      <classpath refid="run.classpath"/>
+      <arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
+    </java>
+  </target>
+
   <target name="get-news-20" unless="20news-18828.exists">
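For reference, the extract-enwiki target's <java> task above amounts to a direct call to the extractor's main(); a minimal sketch (not part of the commit; paths are copied from the target, with ${working.dir} assumed to be the default work directory):

    public class ExtractEnwikiDriver {
      public static void main(String[] args) {
        // same arguments the <arg line="..."/> above passes to the forked JVM
        org.apache.lucene.benchmark.utils.ExtractWikipedia.main(new String[] {
            "temp/enwiki-20070527-pages-articles.xml",  // expanded dump
            "work/enwiki"                               // ${working.dir}/enwiki
        });
      }
    }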
@@ -102,6 +112,8 @@
   <property name="collections.jar" value="commons-collections-3.1.jar"/>
   <property name="logging.jar" value="commons-logging-1.0.4.jar"/>
   <property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
+  <property name="xercesImpl.jar" value="xerces-2.9.0.jar"/>
+  <property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>

   <path id="classpath">
     <pathelement path="${common.dir}/build/classes/java"/>
@@ -110,6 +122,8 @@
     <pathelement path="${basedir}/lib/${collections.jar}"/>
     <pathelement path="${basedir}/lib/${logging.jar}"/>
     <pathelement path="${basedir}/lib/${bean-utils.jar}"/>
+    <pathelement path="${basedir}/lib/${xercesImpl.jar}"/>
+    <pathelement path="${basedir}/lib/${xml-apis.jar}"/>
   </path>
   <path id="run.classpath">
     <path refid="classpath"/>
@@ -143,13 +157,24 @@
     </java>
   </target>

+  <target name="enwiki" depends="compile,check-files,enwiki-files">
+    <echo>Working Directory: ${working.dir}</echo>
+    <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="1024M" fork="true">
+      <assertions>
+        <enable/>
+      </assertions>
+      <classpath refid="run.classpath"/>
+      <arg line="conf/wikipedia.alg"/>
+    </java>
+  </target>
+
   <target name="compile-demo">
     <subant target="compile-demo">
       <fileset dir="${common.dir}" includes="build.xml"/>
     </subant>
   </target>

-  <target name="init" depends="contrib-build.init,compile-demo,check-files"/>
+  <target name="init" depends="common.init,compile-demo,check-files"/>

   <!-- make sure online collections (reuters) are first downloaded -->
   <target name="test" depends="init,get-files">
@@ -0,0 +1,65 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements.  See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License.  You may obtain a copy of the License at
# *
# *     http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# Multi-valued params are iterated by NewRound tasks, added to reports, and start with a column name.
#
# Based on micro-standard.
#
# Modified to use Wikipedia sources and index entire docs;
# currently just used to measure ingest rate.

merge.factor=mrg:10:100:10:100
max.field.length=2147483647
max.buffered=buf:10:10:100:100
compound=true

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory

doc.stored=true
doc.tokenized=true
doc.term.vector=false
doc.add.log.step=500

docs.dir=enwiki

doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker

query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

# Tasks at this depth or less print when they start.
task.max.depth.log=2

log.queries=false
# -------------------------------------------------------------------------------------

{ "Rounds"

    ResetSystemErase

    { "Populate"
        CreateIndex
        { "MAddDocs" AddDoc > : 200000
        CloseIndex
    }

    NewRound

} : 8

RepSumByName
RepSumByPrefRound MAddDocs
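In the task section above, { ... } groups a task sequence, a quoted string names it for reporting, and a trailing ": N" repeats it N times; the "enwiki" Ant target simply hands this file to the benchmark driver. A minimal programmatic equivalent (not part of the commit; run from the benchmark directory):

    public class RunWikipediaAlg {
      public static void main(String[] args) throws Exception {
        // Benchmark.main reads the .alg file and executes its task sequence,
        // here running the "Rounds" sequence 8 times (via "} : 8").
        org.apache.lucene.benchmark.byTask.Benchmark.main(
            new String[] { "conf/wikipedia.alg" });
      }
    }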
@@ -0,0 +1,2 @@
AnyObjectId[99ee39d5be4f9700474691d8a5ed0a5058e27f7b] was removed in git history.
Apache SVN contains full history.
@@ -0,0 +1,2 @@
AnyObjectId[d42c0ea6cfd17ed6b444b8337febbc0bdb55ed83] was removed in git history.
Apache SVN contains full history.
@@ -0,0 +1,210 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.utils.Config;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Stack;

/**
 * A DocMaker using the Dir collection for its input.
 *
 * Config properties:
 * docs.dir=&lt;path to the docs dir&gt; (default: dir-out)
 */
public class DirDocMaker extends BasicDocMaker {

  private DateFormat dateFormat;
  private File dataDir = null;
  private int iteration = 0;

  static public class Iterator implements java.util.Iterator {

    int count = 0;

    public int getCount() {
      return count;
    }

    Stack stack = new Stack();

    /* this seems silly ... there must be a better way ...
       not that this is good, but can it matter? */

    static class Comparator implements java.util.Comparator {
      public int compare(Object _a, Object _b) {
        String a = _a.toString();
        String b = _b.toString();

        int diff = a.length() - b.length();

        if (diff > 0) {
          while (diff-- > 0) {
            b = "0" + b;
          }
        } else if (diff < 0) {
          diff = -diff;
          while (diff-- > 0) {
            a = "0" + a;
          }
        }

        /* note it's reversed because we're going to push,
           which reverses again */
        return b.compareTo(a);
      }
    }

    Comparator c = new Comparator();

    void push(File[] files) {
      Arrays.sort(files, c);
      for (int i = 0; i < files.length; i++) {
        // System.err.println("push " + files[i]);
        stack.push(files[i]);
      }
    }

    void push(File f) {
      // push subdirectories first, then the .txt files, so files pop first
      push(f.listFiles(new FileFilter() {
        public boolean accept(File f) { return f.isDirectory(); } }));
      push(f.listFiles(new FileFilter() {
        public boolean accept(File f) { return f.getName().endsWith(".txt"); } }));
      find();
    }

    void find() {
      // descend until a plain file sits on top of the stack
      if (stack.empty()) {
        return;
      }
      if (!((File) stack.peek()).isDirectory()) {
        return;
      }
      File f = (File) stack.pop();
      push(f);
    }

    public Iterator(File f) {
      push(f);
    }

    public void remove() {
      throw new RuntimeException("cannot");
    }

    public boolean hasNext() {
      return stack.size() > 0;
    }

    public Object next() {
      assert hasNext();
      count++;
      Object object = stack.pop();
      // System.err.println("pop " + object);
      find();
      return object;
    }

  }

  private Iterator inputFiles = null;

  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    String d = config.get("docs.dir", "dir-out");
    dataDir = new File(new File("work"), d);

    inputFiles = new Iterator(dataDir);

    // note: the constructor never returns null; this check is vestigial
    if (inputFiles == null) {
      throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36
    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss", Locale.US);
    dateFormat.setLenient(true);
  }

  protected DocData getNextDocData() throws Exception {
    File f = null;
    String name = null;
    synchronized (this) {
      if (!inputFiles.hasNext()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        inputFiles = new Iterator(dataDir);
        iteration++;
      }
      f = (File) inputFiles.next();
      // System.err.println(f);
      name = f.getCanonicalPath() + "_" + iteration;
    }

    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    // first line is the date, third is the title, the rest is the body
    String dateStr = reader.readLine();
    reader.readLine();  // skip an empty line
    String title = reader.readLine();
    reader.readLine();  // skip an empty line
    StringBuffer bodyBuf = new StringBuffer(1024);
    while ((line = reader.readLine()) != null) {
      bodyBuf.append(line).append(' ');
    }
    reader.close();
    addBytes(f.length());

    Date date = dateFormat.parse(dateStr.trim());
    return new DocData(name, bodyBuf.toString(), title, null, date);
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#resetInputs()
   */
  public synchronized void resetInputs() {
    super.resetInputs();
    inputFiles = new Iterator(dataDir);
    iteration = 0;
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#numUniqueTexts()
   */
  public int numUniqueTexts() {
    return inputFiles.getCount();
  }

}
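One non-obvious detail above: Iterator.Comparator compares numeric file names by left-padding the shorter name with '0' characters, and returns the reversed comparison on purpose, since the sorted batch is pushed onto a stack and popping reverses it back. A standalone sketch of the effect (not part of the commit):

    import java.util.Arrays;
    import java.util.Comparator;

    public class PaddingCompareDemo {

      // Same padding trick as DirDocMaker.Iterator.Comparator above.
      static final Comparator BY_PADDED_NAME_REVERSED = new Comparator() {
        public int compare(Object _a, Object _b) {
          String a = _a.toString();
          String b = _b.toString();
          int diff = a.length() - b.length();
          while (diff > 0) { b = "0" + b; diff--; }  // pad b up to a's length
          while (diff < 0) { a = "0" + a; diff++; }  // pad a up to b's length
          return b.compareTo(a);                     // reversed on purpose
        }
      };

      public static void main(String[] args) {
        String[] names = { "10", "9", "100", "2" };
        Arrays.sort(names, BY_PADDED_NAME_REVERSED);
        // Numerically descending: [100, 10, 9, 2]; pushed onto a stack and
        // popped, the files come back in ascending order: 2, 9, 10, 100.
        System.out.println(Arrays.toString(names));
      }
    }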
@@ -17,19 +17,20 @@ package org.apache.lucene.benchmark.byTask.tasks;
  * limitations under the License.
  */

-import java.io.IOException;
-
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.Directory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;


 /**
  * Create an index.
  * <br>Other side effects: index writer object in perfRunData is set.
- * <br>Relevant properties: <code>merge.factor , max.buffered</code>.
+ * <br>Relevant properties: <code>merge.factor, max.buffered,
+ * max.field.length</code>.
  */
 public class CreateIndexTask extends PerfTask {

@@ -48,10 +49,12 @@ public class CreateIndexTask extends PerfTask {
     boolean cmpnd = config.get("compound",true);
     int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
+    int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);

     iw.setUseCompoundFile(cmpnd);
     iw.setMergeFactor(mrgf);
     iw.setMaxBufferedDocs(mxbf);
+    iw.setMaxFieldLength(mxfl);

     getRunData().setIndexWriter(iw);
     return 1;
@@ -17,23 +17,26 @@ package org.apache.lucene.benchmark.byTask.tasks;
  * limitations under the License.
  */

-import java.io.IOException;
-
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.Directory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;


 /**
  * Open an index writer.
  * <br>Other side effects: index writer object in perfRunData is set.
- * <br>Relevant properties: <code>merge.factor , max.buffered</code>.
+ * <br>Relevant properties: <code>merge.factor, max.buffered,
+ * max.field.length</code>.
  */
 public class OpenIndexTask extends PerfTask {

   public static final int DEFAULT_MAX_BUFFERED = 10;
+  public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
   public static final int DEFAULT_MERGE_PFACTOR = 10;

   public OpenIndexTask(PerfRunData runData) {
@@ -50,9 +53,11 @@ public class OpenIndexTask extends PerfTask {
     boolean cmpnd = config.get("compound",true);
     int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
+    int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);

     // must update params for newly opened writer
     writer.setMaxBufferedDocs(mxbf);
+    writer.setMaxFieldLength(mxfl);
     writer.setMergeFactor(mrgf);
     writer.setUseCompoundFile(cmpnd); // this one redundant?

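Both tasks route these properties into the same Lucene 2.x IndexWriter setters; a rough standalone sketch of the equivalent direct calls (not part of the commit; the index path is hypothetical and the values are the defaults above):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;

    public class WriterSetupDemo {
      public static void main(String[] args) throws Exception {
        IndexWriter iw = new IndexWriter("work/demo-index", new StandardAnalyzer(), true);
        iw.setUseCompoundFile(true);   // compound=true
        iw.setMergeFactor(10);         // merge.factor, DEFAULT_MERGE_PFACTOR
        iw.setMaxBufferedDocs(10);     // max.buffered, DEFAULT_MAX_BUFFERED
        iw.setMaxFieldLength(10000);   // max.field.length, DEFAULT_MAX_FIELD_LENGTH
        iw.close();
      }
    }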
@@ -0,0 +1,211 @@
package org.apache.lucene.benchmark.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Extract the downloaded Wikipedia dump into separate files for indexing.
 */
public class ExtractWikipedia {

  private File wikipedia;
  private File outputDir;

  public ExtractWikipedia(File wikipedia, File outputDir) {
    this.wikipedia = wikipedia;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    File[] files = outputDir.listFiles();
    for (int i = 0; i < files.length; i++) {
      files[i].delete();
    }
  }

  static public int count = 0;
  static String[] months = {"JAN", "FEB", "MAR", "APR",
                            "MAY", "JUN", "JUL", "AUG",
                            "SEP", "OCT", "NOV", "DEC"};

  public class Parser extends DefaultHandler {

    public Parser() {
    }

    StringBuffer contents = new StringBuffer();

    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    String title;
    String id;
    String body;
    String time;

    static final int BASE = 10;

    public void startElement(String namespace,
                             String simple,
                             String qualified,
                             Attributes attributes) {
      if (qualified.equals("page")) {
        title = null;
        id = null;
        body = null;
        time = null;
      } else if (qualified.equals("text")) {
        contents.setLength(0);
      } else if (qualified.equals("timestamp")) {
        contents.setLength(0);
      } else if (qualified.equals("title")) {
        contents.setLength(0);
      } else if (qualified.equals("id")) {
        contents.setLength(0);
      }
    }

    // bucket documents into nested directories keyed by powers of BASE,
    // so no single directory collects too many files
    public File directory(int count, File directory) {
      if (directory == null) {
        directory = outputDir;
      }
      int base = BASE;
      while (base <= count) {
        base *= BASE;
      }
      if (count < BASE) {
        return directory;
      }
      directory = new File(directory, (Integer.toString(base / BASE)));
      directory = new File(directory, (Integer.toString(count / (base / BASE))));
      return directory(count % (base / BASE), directory);
    }

    public void create(String id, String title, String time, String body) {

      File d = directory(count++, null);
      d.mkdirs();
      File f = new File(d, id + ".txt");

      // layout matches what DirDocMaker.getNextDocData expects:
      // date, blank, title, blank, body
      StringBuffer contents = new StringBuffer();

      contents.append(time);
      contents.append("\n\n");
      contents.append(title);
      contents.append("\n\n");
      contents.append(body);
      contents.append("\n");

      try {
        FileWriter writer = new FileWriter(f);
        writer.write(contents.toString());
        writer.close();
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }

    }

    // convert the dump's ISO timestamp (e.g. 2007-05-27T12:34:56Z) into the
    // 27-MAY-2007 12:34:56.000 form that DirDocMaker's date format parses
    String time(String original) {
      StringBuffer buffer = new StringBuffer();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }

    public void endElement(String namespace, String simple, String qualified) {
      if (qualified.equals("title")) {
        title = contents.toString();
      } else if (qualified.equals("text")) {
        body = contents.toString();
        if (body.startsWith("#REDIRECT") ||
            body.startsWith("#redirect")) {
          body = null;
        }
      } else if (qualified.equals("timestamp")) {
        time = time(contents.toString());
      } else if (qualified.equals("id") && id == null) {
        // keep only the first <id> per page: the page id, not revision ids
        id = contents.toString();
      } else if (qualified.equals("page")) {
        if (body != null) {
          create(id, title, time, body);
        }
      }
    }
  }

  public void extract() {

    try {
      Parser parser = new Parser();
      // development toggle: JAXP default parser vs. an explicit Xerces reader
      if (false) {
        SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
        sp.parse(new FileInputStream(wikipedia), parser);
      } else {
        XMLReader reader =
          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        reader.setContentHandler(parser);
        reader.setErrorHandler(parser);
        reader.parse(new InputSource(new FileInputStream(wikipedia)));
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static void main(String[] args) {
    if (args.length != 2) {
      printUsage();
      return;
    }

    File wikipedia = new File(args[0]);

    if (wikipedia.exists()) {
      File outputDir = new File(args[1]);
      outputDir.mkdirs();
      ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
      extractor.extract();
    } else {
      printUsage();
    }
  }

  private static void printUsage() {
    System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
  }

}
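The directory(count, dir) method above spreads extracted pages over nested buckets keyed by successive powers of BASE (10). A standalone trace of the paths it produces (not part of the commit; strings stand in for File objects):

    public class BucketPathDemo {

      static final int BASE = 10;

      // Mirrors ExtractWikipedia.Parser#directory(int, File) above.
      static String directory(int count, String dir) {
        int base = BASE;
        while (base <= count) {
          base *= BASE;  // smallest power of BASE strictly above count
        }
        if (count < BASE) {
          return dir;    // few enough files left: stop nesting
        }
        dir = dir + "/" + (base / BASE) + "/" + (count / (base / BASE));
        return directory(count % (base / BASE), dir);
      }

      public static void main(String[] args) {
        System.out.println(directory(7, "out"));      // out
        System.out.println(directory(42, "out"));     // out/10/4
        System.out.println(directory(12345, "out"));  // out/10000/1/1000/2/100/3/10/4
      }
    }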