LUCENE-848. Add Wikipedia benchmarking support

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@552229 13f79535-47bb-0310-9956-ffa450edef68
2007-07-01 02:19:10 +00:00 · 2007-07-01 02:19:10 +00:00 · bc7c586468
parent 9ff9bf8142
commit bc7c586468
11 changed files with 579 additions and 29 deletions
--- a/contrib/benchmark/.rsync-filter
+++ b/contrib/benchmark/.rsync-filter
@ -0,0 +1,2 @@
 - /work
 - /temp
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
 $Id:$
 6/30/07
 - LUCENE-848: Added support for Wikipedia benchmarking.
 6/25/07
 - LUCENE-940: Multi-threaded issues fixed: SimpleDateFormat; logging for addDoc/deleteDoc tasks.
 - LUCENE-945: tests fail to find data dirs. Added sys-prop benchmark.work.dir and cfg-prop work.dir.
--- a/contrib/benchmark/README.enwiki
+++ b/contrib/benchmark/README.enwiki
@ -0,0 +1,22 @@
 Support exists for downloading, parsing, and loading the English
 version of wikipedia (enwiki).
 The build file can automatically try to download the most current
 enwiki dataset (pages-articles.xml.bz2) from the "latest" directory,
 http://download.wikimedia.org/enwiki/latest/. However, this file
 doesn't always exist, depending on where wikipedia is in the dump
 process and whether prior dumps have succeeded. If this file doesn't
 exist, you can sometimes find an older or in progress version by
 looking in the dated directories under
 http://download.wikimedia.org/enwiki/. For example, as of this
 writing, there is a page file in
 http://download.wikimedia.org/enwiki/20070402/. You can download this
 file manually and put it in temp. Note that the file you download will
 probably have the date in the name, e.g.,
 http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
 you put it in temp, rename it to the file name that build.xml looks for,
 enwiki-20070527-pages-articles.xml.bz2.
 After that, ant enwiki should process the data set and run a load
 test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
 also be used to download, decompress, and extract (to individual files
 in work/enwiki) the dataset, respectively.
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@ -1,22 +1,4 @@
 <?xml version="1.0"?>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at
        http://www.apache.org/licenses/LICENSE-2.0
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
 <project name="benchmark" default="default">
    <description>
@ -39,6 +21,34 @@
        <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
        <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
        <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
        <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
        <available file="${working.dir}/enwiki" property="enwiki.extracted"/>
    </target>
    <!-- Prepares the enwiki data set: creates the temp directory, then calls
         the get-enwiki, expand-enwiki and extract-enwiki targets in that
         order. Runs check-files first so the properties those targets test
         in their "unless" attributes are available. -->
    <target name="enwiki-files" depends="check-files">
        <mkdir dir="temp"/>
        <antcall target="get-enwiki"/>
        <antcall target="expand-enwiki"/>
        <antcall target="extract-enwiki"/>
    </target>
    <!-- Downloads the compressed enwiki dump into temp/ unless check-files
         already found it there (property enwiki.exists).
         Depends on check-files so that the "unless" guard also works when
         this target is run on its own; without it the property is never set
         and the dump is fetched again on every direct invocation.
         Creates temp/ itself because the download needs an existing
         destination directory. -->
    <target name="get-enwiki" depends="check-files" unless="enwiki.exists">
        <mkdir dir="temp"/>
        <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
             dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
    </target>
    <!-- Decompresses the downloaded dump into temp/ unless check-files
         already found the expanded XML file (property enwiki.expanded).
         Depends on check-files so that the "unless" guard also works when
         this target is run on its own. -->
    <target name="expand-enwiki" depends="check-files" unless="enwiki.expanded">
        <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
    </target>
    <!-- Splits the expanded dump into one file per article below
         ${working.dir}/enwiki by running ExtractWikipedia in a forked JVM.
         Skipped when check-files found that directory (enwiki.extracted).
         Depends on compile so the extractor class exists when this target is
         run on its own. failonerror makes a crashed extraction fail the
         build; a forked java task ignores the exit code by default. -->
    <target name="extract-enwiki" depends="compile,check-files" unless="enwiki.extracted">
        <mkdir dir="${working.dir}/enwiki"/>
        <java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia"
              maxmemory="1024M"
              fork="true"
              failonerror="true">
            <classpath refid="run.classpath"/>
            <arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
        </java>
    </target>
    <target name="get-news-20" unless="20news-18828.exists">
@ -102,6 +112,8 @@
    <!-- File names of the third party jars; they are resolved against
         ${basedir}/lib when the classpath is assembled. -->
    <property name="collections.jar" value="commons-collections-3.1.jar"/>
    <property name="logging.jar" value="commons-logging-1.0.4.jar"/>
    <property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
    <!-- XML parser jars: ExtractWikipedia requests the Xerces SAX parser
         (org.apache.xerces.parsers.SAXParser) by class name. -->
    <property name="xercesImpl.jar" value="xerces-2.9.0.jar"/>
    <property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
    <path id="classpath">
        <pathelement path="${common.dir}/build/classes/java"/>
@ -110,6 +122,8 @@
        <pathelement path="${basedir}/lib/${collections.jar}"/>
        <pathelement path="${basedir}/lib/${logging.jar}"/>
        <pathelement path="${basedir}/lib/${bean-utils.jar}"/>
        <pathelement path="${basedir}/lib/${xercesImpl.jar}"/>
        <pathelement path="${basedir}/lib/${xml-apis.jar}"/>
    </path>
    <path id="run.classpath">
        <path refid="classpath"/>
@ -143,13 +157,24 @@
        </java>
    </target>
    <!-- Runs the Wikipedia benchmark described by conf/wikipedia.alg in a
         forked JVM with assertions enabled, after compiling and preparing
         the enwiki data set.
         failonerror makes a failed benchmark run fail the build; a forked
         java task ignores the exit code by default. -->
    <target name="enwiki" depends="compile,check-files,enwiki-files">
        <echo>Working Directory: ${working.dir}</echo>
        <java classname="org.apache.lucene.benchmark.byTask.Benchmark"
              maxmemory="1024M"
              fork="true"
              failonerror="true">
            <assertions>
              <enable/>
            </assertions>
            <classpath refid="run.classpath"/>
            <arg line="conf/wikipedia.alg"/>
        </java>
    </target>
    <!-- Delegates to the compile-demo target of the build.xml found in
         ${common.dir}. -->
    <target name="compile-demo">
      <subant target="compile-demo">
         <fileset dir="${common.dir}" includes="build.xml"/>
      </subant>
    </target> 
-    <target name="init" depends="contrib-build.init,compile-demo,check-files"/>
+    <target name="init" depends="common.init,compile-demo,check-files"/>
    <!-- make sure online collections (reuters) are first downloaded -->
    <target name="test" depends="init,get-files">
--- a/contrib/benchmark/conf/wikipedia.alg
+++ b/contrib/benchmark/conf/wikipedia.alg
@ -0,0 +1,65 @@
 #/**
 # * Licensed to the Apache Software Foundation (ASF) under one or more
 # * contributor license agreements.  See the NOTICE file distributed with
 # * this work for additional information regarding copyright ownership.
 # * The ASF licenses this file to You under the Apache License, Version 2.0
 # * (the "License"); you may not use this file except in compliance with
 # * the License.  You may obtain a copy of the License at
 # *
 # *     http://www.apache.org/licenses/LICENSE-2.0
 # *
 # * Unless required by applicable law or agreed to in writing, software
 # * distributed under the License is distributed on an "AS IS" BASIS,
 # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # * See the License for the specific language governing permissions and
 # * limitations under the License.
 # */
 # -------------------------------------------------------------------------------------
 # multi val params are iterated by NewRound's, added to reports, start with column name.
 #
 # based on micro-standard
 #
 # modified to use wikipedia sources and index entire docs
 # currently just used to measure ingest rate
 merge.factor=mrg:10:100:10:100
 max.field.length=2147483647
 max.buffered=buf:10:10:100:100
 compound=true
 analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
 directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
 doc.add.log.step=500
 docs.dir=enwiki
 doc.maker=org.apache.lucene.benchmark.byTask.feeds.DirDocMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 # tasks at this depth or less print a log line when they start
 task.max.depth.log=2
 log.queries=false
 # -------------------------------------------------------------------------------------
 # The algorithm itself. Judging by the task names: every repetition of
 # "Rounds" (8 in total) resets the system and erases the index, then
 # "Populate" creates an index, adds documents through "MAddDocs" (limit
 # 200000) and closes the index. NewRound then advances the multi val params
 # declared above. The two Rep* lines print summary reports.
 # NOTE(review): the exact meaning of the brace, '>' and ':' syntax is defined
 # by the benchmark byTask package; confirm there before editing.
 { "Rounds"
    ResetSystemErase
    { "Populate"
        CreateIndex
        { "MAddDocs" AddDoc > : 200000
        CloseIndex
    }
    NewRound
 } : 8
 RepSumByName
 RepSumByPrefRound MAddDocs
--- a/contrib/benchmark/lib/xerces-2.9.0.jar
+++ b/contrib/benchmark/lib/xerces-2.9.0.jar
@ -0,0 +1,2 @@
 AnyObjectId[99ee39d5be4f9700474691d8a5ed0a5058e27f7b] was removed in git history.
 Apache SVN contains full history.
--- a/contrib/benchmark/lib/xml-apis-2.9.0.jar
+++ b/contrib/benchmark/lib/xml-apis-2.9.0.jar
@ -0,0 +1,2 @@
 AnyObjectId[d42c0ea6cfd17ed6b444b8337febbc0bdb55ed83] was removed in git history.
 Apache SVN contains full history.
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
@ -0,0 +1,210 @@
 package org.apache.lucene.benchmark.byTask.feeds;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileReader;
 import java.text.DateFormat;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.Locale;
 import java.util.Stack;
 /**
 * A DocMaker that reads its documents from the <code>.txt</code> files found
 * in a directory tree.
 * <p>
 * Every file is read as follows: line 1 is the date, line 2 is skipped,
 * line 3 is the title, line 4 is skipped and all remaining lines form the body.
 * <p>
 * Config properties:
 * <ul>
 * <li>work.dir=&lt;base directory that docs.dir is resolved against| Default: work&gt;</li>
 * <li>docs.dir=&lt;path to the docs dir| Default: dir-out&gt;</li>
 * </ul>
 */
 public class DirDocMaker extends BasicDocMaker {
  /* SimpleDateFormat is not safe for concurrent use: lock on it while parsing. */
  private DateFormat dateFormat;
  private File dataDir = null;
  private int iteration=0;
  /**
   * Iterates over the <code>.txt</code> files below a directory.
   * Files of a directory are returned before the content of its
   * sub-directories; within one directory entries are ordered by
   * {@link Comparator}. Directories are expanded lazily, one at a time.
   */
  static public class Iterator implements java.util.Iterator {
    int count = 0;
    Stack stack = new Stack();
    Comparator c = new Comparator();
    /** @return the number of files handed out by {@link #next()} so far. */
    public int getCount(){
      return count;
    }
    /**
     * Orders paths by their string form after left-padding the shorter one
     * with '0' characters, so that e.g. "2" sorts before "10". The result is
     * reversed on purpose: entries are pushed on a stack, which reverses the
     * order once more.
     */
    static class Comparator implements java.util.Comparator {
      public int compare(Object _a, Object _b) {
        String a = _a.toString();
        String b = _b.toString();
        int diff = a.length() - b.length();
        if (diff > 0) {
          b = zeroPad(b, diff);
        } else if (diff < 0) {
          a = zeroPad(a, -diff);
        }
        /* note it's reversed because we're going to push,
           which reverses again */
        return b.compareTo(a);
      }
      /* Returns s prefixed with n '0' characters. */
      private static String zeroPad(String s, int n) {
        StringBuffer padded = new StringBuffer(s.length() + n);
        for (int i = 0; i < n; i++) {
          padded.append('0');
        }
        return padded.append(s).toString();
      }
    }
    /* Sorts the entries and pushes them; the last one pushed is returned first. */
    void push(File[] files) {
      if (files == null) {
        // listFiles() returns null for a missing directory or on an I/O error
        return;
      }
      Arrays.sort(files, c);
      for(int i = 0; i < files.length; i++) {
        stack.push(files[i]);
      }
    }
    /* Pushes the sub-directories of f, then its .txt files on top of them. */
    void push(File f) {
      push(f.listFiles(new FileFilter() {
          public boolean accept(File f) { return f.isDirectory(); } }));
      push(f.listFiles(new FileFilter() {
          public boolean accept(File f) { return f.getName().endsWith(".txt"); } }));
      find();
    }
    /* Expands directories on top of the stack until a file is on top or the stack is empty. */
    void find() {
      if (stack.empty()) {
        return;
      }
      if (!((File)stack.peek()).isDirectory()) {
        return;
      }
      File f = (File)stack.pop();
      push(f);
    }
    /** @param f root directory to iterate over */
    public Iterator(File f) {
      push(f);
    }
    /** Not supported. */
    public void remove() {
      throw new UnsupportedOperationException("cannot");
    }
    public boolean hasNext() {
      return stack.size() > 0;
    }
    /**
     * @return the next file, as a {@link File}
     * @throws java.util.NoSuchElementException if no file is left
     */
    public Object next() {
      // an explicit check: asserts are disabled unless the JVM runs with -ea
      if (!hasNext()) {
        throw new java.util.NoSuchElementException();
      }
      count++;
      Object object = stack.pop();
      find();
      return object;
    }
  }
  private Iterator inputFiles = null;
  /**
   * Reads work.dir and docs.dir from the config and starts iterating the
   * resulting directory.
   * @throws RuntimeException if that directory holds no .txt files
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    File workDir = new File(config.get("work.dir", "work"));
    String d = config.get("docs.dir", "dir-out");
    dataDir = new File(workDir, d);
    inputFiles = new Iterator(dataDir);
    // the iterator itself is never null, so test whether it found anything
    if (!inputFiles.hasNext()) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36
    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US);
    dateFormat.setLenient(true);
  }
  /**
   * Reads the next file and turns it into a DocData. The document name is
   * the canonical path of the file followed by "_" and the iteration number.
   * @throws NoMoreDataException if the files are exhausted and
   *         <code>forever</code> is false, or if a new pass finds no files
   * @throws java.io.IOException if the file is empty or cannot be read
   */
  protected DocData getNextDocData() throws Exception {
    File f = null;
    String name = null;
    synchronized (this) {
      if (!inputFiles.hasNext()) { 
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        inputFiles = new Iterator(dataDir);
        iteration++;
        if (!inputFiles.hasNext()) {
          // the directory is empty now: there is nothing to cycle over
          throw new NoMoreDataException();
        }
      }
      f = (File) inputFiles.next();
      name = f.getCanonicalPath()+"_"+iteration;
    }
    String dateStr;
    String title;
    StringBuffer bodyBuf = new StringBuffer(1024);
    BufferedReader reader = new BufferedReader(new FileReader(f));
    try {
      //First line is the date, 3rd is the title, rest is body
      dateStr = reader.readLine();
      reader.readLine();//skip an empty line
      title = reader.readLine();
      reader.readLine();//skip an empty line
      String line = null;
      while ((line = reader.readLine()) != null) {
        bodyBuf.append(line).append(' ');
      }
    } finally {
      // release the file handle even when reading fails
      reader.close();
    }
    if (dateStr == null) {
      throw new java.io.IOException("Empty document file: " + f);
    }
    addBytes(f.length());
    Date date;
    synchronized (dateFormat) {
      date = dateFormat.parse(dateStr.trim());
    }
    return new DocData(name, bodyBuf.toString(), title, null, date);
  }
  /**
   * Restarts the iteration over the docs dir and resets the iteration number.
   */
  public synchronized void resetInputs() {
    super.resetInputs();
    inputFiles = new Iterator(dataDir);
    iteration = 0;
  }
  /**
   * @return the number of files handed out by the current iterator so far
   */
  public int numUniqueTexts() {
    return inputFiles.getCount();
  }
 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
@ -17,19 +17,20 @@ package org.apache.lucene.benchmark.byTask.tasks;
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.Directory;
 import java.io.IOException;
 /**
 * Create an index.
 * <br>Other side effects: index writer object in perfRunData is set.
- * <br>Relevant properties: <code>merge.factor , max.buffered</code>.
+ * <br>Relevant properties: <code>merge.factor, max.buffered,
 *  max.field.length</code>.
 */
 public class CreateIndexTask extends PerfTask {
@ -48,10 +49,12 @@ public class CreateIndexTask extends PerfTask {
    boolean cmpnd = config.get("compound",true);
    int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
    int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
    int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
    iw.setUseCompoundFile(cmpnd);
    iw.setMergeFactor(mrgf);
    iw.setMaxBufferedDocs(mxbf);
    iw.setMaxFieldLength(mxfl);
    getRunData().setIndexWriter(iw);
    return 1;
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
@ -17,23 +17,26 @@ package org.apache.lucene.benchmark.byTask.tasks;
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.Directory;
 import java.io.IOException;
 /**
 * Open an index writer.
 * <br>Other side effects: index writer object in perfRunData is set.
- * <br>Relevant properties: <code>merge.factor , max.buffered</code>.
+ * <br>Relevant properties: <code>merge.factor, max.buffered,
 * max.field.length</code>.
 */
 public class OpenIndexTask extends PerfTask {
  public static final int DEFAULT_MAX_BUFFERED = 10;
  public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
  public static final int DEFAULT_MERGE_PFACTOR = 10;
  public OpenIndexTask(PerfRunData runData) {
@ -50,9 +53,11 @@ public class OpenIndexTask extends PerfTask {
    boolean cmpnd = config.get("compound",true);
    int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
    int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
    int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
    // must update params for newly opened writer
    writer.setMaxBufferedDocs(mxbf);
    writer.setMaxFieldLength(mxfl);
    writer.setMergeFactor(mrgf);
    writer.setUseCompoundFile(cmpnd); // this one redundant?
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
@ -0,0 +1,211 @@
 package org.apache.lucene.benchmark.utils;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.XMLReader;
 import org.xml.sax.helpers.DefaultHandler;
 import org.xml.sax.helpers.XMLReaderFactory;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileWriter;
 import java.io.IOException;
 /**
 * Extract the downloaded Wikipedia dump into separate files for indexing.
 * <p>
 * Every page that has a text body and is not a redirect is written to
 * <code>&lt;id&gt;.txt</code> below the output directory. The file holds the
 * converted timestamp, an empty line, the title, an empty line and the body.
 * <p>
 * NOTE(review): files are written with the platform default encoding
 * (FileWriter); confirm that readers of these files use the same encoding.
 */
 public class ExtractWikipedia {
  private File wikipedia;
  private File outputDir;
  /**
   * Remembers the input and output locations and deletes the files found
   * directly in <code>outputDir</code> (not recursively).
   * @param wikipedia the expanded Wikipedia XML dump
   * @param outputDir directory that receives the extracted files
   */
  public ExtractWikipedia(File wikipedia, File outputDir) {
    this.wikipedia = wikipedia;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    File [] files = outputDir.listFiles();
    // listFiles() returns null when outputDir is missing or is not a directory
    if (files != null) {
      for (int i = 0; i < files.length; i++) {
        files[i].delete();
      }
    }
  }
  /** Number of files written so far; it also selects the target directory. */
  static public int count = 0;
  static String[] months = {"JAN", "FEB", "MAR", "APR",
                            "MAY", "JUN", "JUL", "AUG",
                            "SEP", "OCT", "NOV", "DEC"};
  /**
   * SAX handler that collects title, id, timestamp and text of each
   * <code>page</code> element and writes the page out when the element ends.
   */
  public class Parser extends DefaultHandler {
    public Parser() {
    }
    /* Character data gathered since the last element of interest started. */
    StringBuffer contents = new StringBuffer();
    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }
    String title;
    String id;
    String body;
    String time;
    static final int BASE = 10;
    /**
     * Clears the per page state on <code>page</code> and the text buffer on
     * every element whose content is collected.
     */
    public void startElement(String namespace,
                             String simple,
                             String qualified,
                             Attributes attributes) {
      if (qualified.equals("page")) {
        title = null;
        id = null;
        body = null;
        time = null;
      } else if (qualified.equals("text")
                 || qualified.equals("timestamp")
                 || qualified.equals("title")
                 || qualified.equals("id")) {
        contents.setLength(0);
      }
    }
    /**
     * Maps a running number to a nested directory so that no directory
     * collects too many entries. Numbers below BASE map to
     * <code>directory</code> itself; otherwise one level named after the
     * magnitude and one named after the leading digit are added and the
     * remainder is resolved recursively, e.g. 123 maps to 100/1/10/2.
     * @param count running number of the file
     * @param directory start directory, or null for the output directory
     */
    public File directory (int count, File directory) {
      if (directory == null) {
        directory = outputDir;
      }
      int base = BASE;
      while (base <= count) {
        base *= BASE;
      }
      if (count < BASE) {
        return directory;
      }
      directory = new File (directory, (Integer.toString(base / BASE)));
      directory = new File (directory, (Integer.toString(count / (base / BASE))));
      return directory(count % (base / BASE), directory);
    }
    /**
     * Writes one page to <code>&lt;id&gt;.txt</code> in the directory chosen
     * by {@link #directory(int, File)} and increments the file counter.
     * @throws RuntimeException wrapping the IOException if writing fails
     */
    public void create(String id, String title, String time, String body) {
      File d = directory(count++, null);
      d.mkdirs();
      File f = new File(d, id + ".txt");
      StringBuffer text = new StringBuffer();
      text.append(time);
      text.append("\n\n");
      text.append(title);
      text.append("\n\n");
      text.append(body);
      text.append("\n");
      try {
        FileWriter writer = new FileWriter(f);
        try {
          writer.write(text.toString());
        } finally {
          // release the file handle even when the write fails
          writer.close();
        }
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }
    /**
     * Rearranges a timestamp laid out as yyyy-MM-ddTHH:mm:ss... into
     * dd-MMM-yyyy HH:mm:ss.000 with an upper case English month name.
     */
    String time(String original) {
      StringBuffer buffer = new StringBuffer();
      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");
      return buffer.toString();
    }
    /**
     * Stores the collected text of the finished element; a finished
     * <code>page</code> is written out unless it has no body or is a redirect.
     */
    public void endElement(String namespace, String simple, String qualified) {
      if (qualified.equals("title")) {
        title = contents.toString();
      } else if (qualified.equals("text")) {
        body = contents.toString();
        // redirect pages carry no content; match the keyword in any case
        if (body.regionMatches(true, 0, "#REDIRECT", 0, "#REDIRECT".length())) {
          body = null;
        }
      } else if (qualified.equals("timestamp")) {
        time = time(contents.toString());
      } else if (qualified.equals("id") && id == null) {
        // only the first id inside a page is kept
        id = contents.toString();
      } else if (qualified.equals("page")) {
        if (body != null) {
          create(id, title, time, body);
        }
      }
    }
  }
  /**
   * Parses the dump with the Xerces SAX parser and writes out its pages.
   * @throws RuntimeException wrapping any failure while parsing or writing
   */
  public void extract() {
    FileInputStream in = null;
    try {
      Parser parser = new Parser();
      XMLReader reader =
        XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
      reader.setContentHandler(parser);
      reader.setErrorHandler(parser);
      in = new FileInputStream(wikipedia);
      reader.parse(new InputSource(in));
    } catch (Exception e) {
      throw new RuntimeException(e);
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // nothing useful can be done when closing the input fails
        }
      }
    }
  }
  /**
   * Command line entry point.
   * @param args path to the Wikipedia XML file, output path
   */
  public static void main(String[] args) {
    if (args.length != 2) {
      printUsage();
      // stop here: args[0] and args[1] are not both available
      return;
    }
    File wikipedia = new File(args[0]);
    if (wikipedia.exists()) {
      File outputDir = new File(args[1]);
      outputDir.mkdirs();
      ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
      extractor.extract();
    } else {
      printUsage();
    }
  }
  private static void printUsage() {
    System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
  }
 }
		`@ -0,0 +1,2 @@`
							`AnyObjectId[99ee39d5be4f9700474691d8a5ed0a5058e27f7b] was removed in git history.`
							`Apache SVN contains full history.`
		`@ -0,0 +1,2 @@`
							`AnyObjectId[d42c0ea6cfd17ed6b444b8337febbc0bdb55ed83] was removed in git history.`
							`Apache SVN contains full history.`