LUCENE-971: extract wikipedia documents as a doc maker directly from XML file without using intermediate one-file-per-document

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564151 13f79535-47bb-0310-9956-ffa450edef68
2007-08-09 08:57:26 +00:00 · 2007-08-09 08:57:26 +00:00 · d1422ebd6b
parent 2d954694dc
commit d1422ebd6b
7 changed files with 279 additions and 232 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

 $Id:$

+8/9/07
+  LUCENE-971: Change enwiki tasks to a doc maker (extending
+  LineDocMaker) that directly processes the Wikipedia XML and produces
+  documents.  Intermediate files (one per document) are no longer
+  created.
+
 8/1/07
  LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.

--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@ -23,7 +23,7 @@
        
        <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
        <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
-        <available file="${working.dir}/enwiki" property="enwiki.extracted"/>
+        <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>

    </target>

@ -31,7 +31,6 @@
        <mkdir dir="temp"/>
        <antcall target="get-enwiki"/>
        <antcall target="expand-enwiki"/>
-        <antcall target="extract-enwiki"/>
    </target>

    <target name="get-enwiki" unless="enwiki.exists">
@ -43,14 +42,6 @@
        <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
    </target>

-    <target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
-        <mkdir dir="${working.dir}/enwiki"/>
-        <java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
-            <classpath refid="run.classpath"/>
-            <arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
-        </java>
-    </target>
-
    <target name="get-news-20" unless="20news-18828.exists">
        <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
             dest="temp/news20.tar.gz"/>
@ -164,7 +155,7 @@
              <enable/>
            </assertions>
            <classpath refid="run.classpath"/>
-            <arg line="conf/wikipedia.alg"/>
+            <arg line="conf/extractWikipedia.alg"/>
        </java>
    </target>

--- a/contrib/benchmark/conf/extractWikipedia.alg
+++ b/contrib/benchmark/conf/extractWikipedia.alg
@ -0,0 +1,44 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg will process the Wikipedia documents feed to produce a
+# single file that contains all documents, one per line.
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/extractWikipedia.alg
+#
+# Then, to index the documents in the line file, see
+# indexLineFile.alg.
+#
+
+# Where to get documents from:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
+docs.file=temp/enwiki-20070527-pages-articles.xml
+
+# Where to write the line file output:
+line.file.out=work/enwiki.txt
+
+# Stop after processing the document feed once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents, appending each one to the line file:
+{WriteLineDoc() > : *
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
@ -0,0 +1,213 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.xml.sax.XMLReader;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+
+/**
+ * A LineDocMaker which reads the uncompressed english wikipedia dump.
+ */
+public class EnwikiDocMaker extends LineDocMaker {
+
+  static final int TITLE = 0;
+  static final int DATE = TITLE+1;
+  static final int BODY = DATE+1;
+  static final int LENGTH = BODY+1;
+
+  static final String[] months = {"JAN", "FEB", "MAR", "APR",
+                                  "MAY", "JUN", "JUL", "AUG",
+                                  "SEP", "OCT", "NOV", "DEC"};
+
+  class Parser extends DefaultHandler implements Runnable {
+
+    Thread t;
+
+    public void run() {
+
+      try {
+        XMLReader reader =
+          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
+        reader.setContentHandler(this);
+        reader.setErrorHandler(this);
+        while(true){
+          InputSource is = new InputSource(fileIS);
+          reader.parse(is);
+          if (!forever) {
+            synchronized(this) {
+              nmde = new NoMoreDataException();
+              notify();
+            }
+            return;
+          } else {
+            synchronized(this){
+              openFile();
+            }
+          }
+        }
+      } catch (SAXException sae) {
+        throw new RuntimeException(sae);
+      } catch (IOException ioe) {
+        throw new RuntimeException(ioe);
+      }
+
+    }
+
+    Parser() {
+      t = new Thread(this);
+      t.setDaemon(true);
+      t.start();
+    }
+
+    String[] tuple;
+    NoMoreDataException nmde;
+
+    String[] next() throws NoMoreDataException {
+      String[] result;
+      synchronized(this){
+        while(tuple == null && nmde == null){
+          try {
+            wait();
+          } catch (InterruptedException ie) {
+          }
+        }
+        if (nmde != null) {
+          throw nmde;
+        }
+        result = tuple;
+        tuple = null;
+        notify();
+      }
+      return result;
+    }
+
+    StringBuffer contents = new StringBuffer();
+
+    public void characters(char[] ch, int start, int length) {
+      contents.append(ch, start, length);
+    }
+
+    String title;
+    String body;
+    String time;
+
+    static final int BASE = 10;
+    
+    public void startElement(String namespace,
+                             String simple,
+                             String qualified,
+                             Attributes attributes) {
+      if (qualified.equals("page")) {
+        title = null;
+        body = null;
+        time = null;
+      } else if (qualified.equals("text")) {
+        contents.setLength(0);
+      } else if (qualified.equals("timestamp")) {
+        contents.setLength(0);
+      } else if (qualified.equals("title")) {
+        contents.setLength(0);
+      }
+    }
+
+    String time(String original) {
+      StringBuffer buffer = new StringBuffer();
+
+      buffer.append(original.substring(8, 10));
+      buffer.append('-');
+      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
+      buffer.append('-');
+      buffer.append(original.substring(0, 4));
+      buffer.append(' ');
+      buffer.append(original.substring(11, 19));
+      buffer.append(".000");
+
+      return buffer.toString();
+    }
+
+    public void create(String title, String time, String body) {
+      String[] t = new String[LENGTH];
+      t[TITLE] = title.replace('\t', ' ');
+      t[DATE] = time.replace('\t', ' ');
+      t[BODY] = body.replaceAll("[\t\n]", " ");
+      synchronized(this) {
+        while(tuple!=null) {
+          try {
+            wait();
+          } catch (InterruptedException ie) {
+          }
+        }
+        tuple = t;
+        notify();
+      }
+    }
+
+    public void endElement(String namespace, String simple, String qualified)
+      throws SAXException {
+      if (qualified.equals("title")) {
+        title = contents.toString();
+      } else if (qualified.equals("text")) {
+        body = contents.toString();
+        if (body.startsWith("#REDIRECT") ||
+             body.startsWith("#redirect")) {
+          body = null;
+        }
+      } else if (qualified.equals("timestamp")) {
+        time = time(contents.toString());
+      } else if (qualified.equals("page")) {
+        if (body != null) {
+          create(title, time, body);
+        }
+      }
+    }
+  }
+
+  Parser parser = new Parser();
+
+  class DocState extends LineDocMaker.DocState {
+    public Document setFields(String[] tuple) {
+      titleField.setValue(tuple[TITLE]);
+      dateField.setValue(tuple[DATE]);
+      bodyField.setValue(tuple[BODY]);
+      return doc;
+    }
+  }
+
+  private DocState getDocState() {
+    DocState ds = (DocState) docState.get();
+    if (ds == null) {
+      ds = new DocState();
+      docState.set(ds);
+    }
+    return ds;
+  }
+
+  public Document makeDocument() throws Exception {
+    String[] tuple = parser.next();
+    return getDocState().setFields(tuple);
+  }
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
@ -25,7 +25,8 @@ import org.apache.lucene.document.Field;

 import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;

 /**
 * A DocMaker reading one line at a time as a Document from
@ -39,13 +40,14 @@ import java.io.FileReader;
 */
 public class LineDocMaker extends BasicDocMaker {

-  private BufferedReader fileIn;
-  private ThreadLocal docState = new ThreadLocal();
+  FileInputStream fileIS;
+  BufferedReader fileIn;
+  ThreadLocal docState = new ThreadLocal();
  private String fileName;

  private static int READER_BUFFER_BYTES = 64*1024;
-
-  private class DocState {
+  
+  class DocState {
    Document doc;
    Field bodyField;
    Field titleField;
@ -63,7 +65,7 @@ public class LineDocMaker extends BasicDocMaker {
                             storeVal,
                             Field.Index.TOKENIZED,
                             termVecVal);
-      dateField = new Field(BasicDocMaker.TITLE_FIELD,
+      dateField = new Field(BasicDocMaker.DATE_FIELD,
                            "",
                            storeVal,
                            Field.Index.TOKENIZED,
@ -143,11 +145,12 @@ public class LineDocMaker extends BasicDocMaker {
    openFile();
  }

-  private void openFile() {
+  void openFile() {
    try {
      if (fileIn != null)
        fileIn.close();
-      fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
+      fileIS = new FileInputStream(fileName);
+      fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
@ -18,7 +18,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
 */

 import java.io.BufferedWriter;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;

 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@ -59,7 +60,7 @@ public class WriteLineDocTask extends PerfTask {
      String fileName = config.get("line.file.out", null);
      if (fileName == null)
        throw new Exception("line.file.out must be set");
-      lineFileOut = new BufferedWriter(new FileWriter(fileName));
+      lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
    }
    docMaker = getRunData().getDocMaker();
  }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
@ -1,211 +0,0 @@
-package org.apache.lucene.benchmark.utils;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.helpers.XMLReaderFactory;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-
-/**
- * Extract the downloaded Wikipedia dump into separate files for indexing.
- */
-public class ExtractWikipedia {
-
-  private File wikipedia;
-  private File outputDir;
-
-  public ExtractWikipedia(File wikipedia, File outputDir) {
-    this.wikipedia = wikipedia;
-    this.outputDir = outputDir;
-    System.out.println("Deleting all files in " + outputDir);
-    File [] files = outputDir.listFiles();
-    for (int i = 0; i < files.length; i++) {
-      files[i].delete();
-    }
-  }
-
-  static public int count = 0;
-  static String[] months = {"JAN", "FEB", "MAR", "APR",
-                            "MAY", "JUN", "JUL", "AUG",
-                            "SEP", "OCT", "NOV", "DEC"};
-
-  public class Parser extends DefaultHandler {
-
-    public Parser() {
-    }
-
-    StringBuffer contents = new StringBuffer();
-
-    public void characters(char[] ch, int start, int length) {
-      contents.append(ch, start, length);
-    }
-
-    String title;
-    String id;
-    String body;
-    String time;
-
-    static final int BASE = 10;
-    
-    public void startElement(String namespace,
-                             String simple,
-                             String qualified,
-                             Attributes attributes) {
-      if (qualified.equals("page")) {
-        title = null;
-        id = null;
-        body = null;
-        time = null;
-      } else if (qualified.equals("text")) {
-        contents.setLength(0);
-      } else if (qualified.equals("timestamp")) {
-        contents.setLength(0);
-      } else if (qualified.equals("title")) {
-        contents.setLength(0);
-      } else if (qualified.equals("id")) {
-        contents.setLength(0);
-      }
-    }
-
-    public File directory (int count, File directory) {
-      if (directory == null) {
-        directory = outputDir;
-      }
-      int base = BASE;
-      while (base <= count) {
-        base *= BASE;
-      }
-      if (count < BASE) {
-        return directory;
-      }
-      directory = new File (directory, (Integer.toString(base / BASE)));
-      directory = new File (directory, (Integer.toString(count / (base / BASE))));
-      return directory(count % (base / BASE), directory);
-    }
-
-    public void create(String id, String title, String time, String body) {
-
-      File d = directory(count++, null);
-      d.mkdirs();
-      File f = new File(d, id + ".txt");
-
-      StringBuffer contents = new StringBuffer();
-      
-      contents.append(time);
-      contents.append("\n\n");
-      contents.append(title);
-      contents.append("\n\n");
-      contents.append(body);
-      contents.append("\n");
-
-      try {
-        FileWriter writer = new FileWriter(f);
-        writer.write(contents.toString());
-        writer.close();
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
-
-    }
-
-    String time(String original) {
-      StringBuffer buffer = new StringBuffer();
-
-      buffer.append(original.substring(8, 10));
-      buffer.append('-');
-      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
-      buffer.append('-');
-      buffer.append(original.substring(0, 4));
-      buffer.append(' ');
-      buffer.append(original.substring(11, 19));
-      buffer.append(".000");
-
-      return buffer.toString();
-    }
-
-    public void endElement(String namespace, String simple, String qualified) {
-      if (qualified.equals("title")) {
-        title = contents.toString();
-      } else if (qualified.equals("text")) {
-        body = contents.toString();
-        if (body.startsWith("#REDIRECT") ||
-             body.startsWith("#redirect")) {
-          body = null;
-        }
-      } else if (qualified.equals("timestamp")) {
-        time = time(contents.toString());
-      } else if (qualified.equals("id") && id == null) {
-        id = contents.toString();
-      } else if (qualified.equals("page")) {
-        if (body != null) {
-          create(id, title, time, body);
-        }
-      }
-    }
-  }
-
-  public void extract() {
-
-    try {
-      Parser parser = new Parser();
-      if (false) {
-        SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
-        sp.parse(new FileInputStream(wikipedia), parser);
-      } else {
-        XMLReader reader =
-          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
-        reader.setContentHandler(parser);
-        reader.setErrorHandler(parser);
-        reader.parse(new InputSource(new FileInputStream(wikipedia)));
-      }
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-  }
-
-  public static void main(String[] args) {
-    if (args.length != 2) {
-      printUsage();
-    }
-
-    File wikipedia = new File(args[0]);
-
-    if (wikipedia.exists()) {
-      File outputDir = new File(args[1]);
-      outputDir.mkdirs();
-      ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
-      extractor.extract();
-    } else {
-      printUsage();
-    }
-  }
-
-  private static void printUsage() {
-    System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia <Path to Wikipedia XML file> <Output Path>");
-  }
-
-}