From d1422ebd6b1681d173ea1c4184cad143b0ec6fa3 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Thu, 9 Aug 2007 08:57:26 +0000
Subject: [PATCH] LUCENE-971: extract wikipedia documents as a doc maker directly from XML file without using intermediate one-file-per-document

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564151 13f79535-47bb-0310-9956-ffa450edef68
---
 contrib/benchmark/CHANGES.txt                     |   6 +
 contrib/benchmark/build.xml                       |  13 +-
 contrib/benchmark/conf/extractWikipedia.alg       |  44 ++++
 .../byTask/feeds/EnwikiDocMaker.java              | 213 ++++++++++++++++++
 .../benchmark/byTask/feeds/LineDocMaker.java      |  19 +-
 .../byTask/tasks/WriteLineDocTask.java            |   5 +-
 .../benchmark/utils/ExtractWikipedia.java         | 211 -----------------
 7 files changed, 279 insertions(+), 232 deletions(-)
 create mode 100644 contrib/benchmark/conf/extractWikipedia.alg
 create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
 delete mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java

diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt
index 6035e6ebd24..13c27398010 100644
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
 
 $Id:$
 
+8/9/07
+  LUCENE-971: Change enwiki tasks to a doc maker (extending
+  LineDocMaker) that directly processes the Wikipedia XML and produces
+  documents.  Intermediate files (one per document) are no longer
+  created.
+
 8/1/07
   LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just
   tokenization.

diff --git a/contrib/benchmark/build.xml b/contrib/benchmark/build.xml
index afc59da8762..95f3913aa0b 100644
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@@ -23,7 +23,7 @@
-
+
@@ -31,7 +31,6 @@
-
@@ -43,14 +42,6 @@
-
-
-
-
-
-
-
-
@@ -164,7 +155,7 @@
-
+

diff --git a/contrib/benchmark/conf/extractWikipedia.alg b/contrib/benchmark/conf/extractWikipedia.alg
new file mode 100644
index 00000000000..0754e0a374e
--- /dev/null
+++ b/contrib/benchmark/conf/extractWikipedia.alg
@@ -0,0 +1,44 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg will process the Wikipedia documents feed to produce a
+# single file that contains all documents, one per line.
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/extractWikipedia.alg
+#
+# Then, to index the documents in the line file, see
+# indexLineFile.alg.
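+#
+# Note: each line in the output holds a single document's title, date
+# and body; EnwikiDocMaker strips tab and newline characters from the
+# field values so that every document stays on one line.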
+#
+
+# Where to get documents from:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
+docs.file=temp/enwiki-20070527-pages-articles.xml
+
+# Where to write the line file output:
+line.file.out=work/enwiki.txt
+
+# Stop after processing the document feed once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents, appending each one to the line file:
+{WriteLineDoc() > : *
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
new file mode 100644
index 00000000000..36510dfad3c
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
@@ -0,0 +1,213 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.xml.sax.XMLReader;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+
+/**
+ * A LineDocMaker which reads the uncompressed English Wikipedia dump.
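+ *
+ * Typical configuration (see conf/extractWikipedia.alg):
+ * doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker,
+ * docs.file pointing at the Wikipedia pages-articles XML dump, and
+ * doc.maker.forever=false to stop after a single pass over the dump.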
+ */ +public class EnwikiDocMaker extends LineDocMaker { + + static final int TITLE = 0; + static final int DATE = TITLE+1; + static final int BODY = DATE+1; + static final int LENGTH = BODY+1; + + static final String[] months = {"JAN", "FEB", "MAR", "APR", + "MAY", "JUN", "JUL", "AUG", + "SEP", "OCT", "NOV", "DEC"}; + + class Parser extends DefaultHandler implements Runnable { + + Thread t; + + public void run() { + + try { + XMLReader reader = + XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); + reader.setContentHandler(this); + reader.setErrorHandler(this); + while(true){ + InputSource is = new InputSource(fileIS); + reader.parse(is); + if (!forever) { + synchronized(this) { + nmde = new NoMoreDataException(); + notify(); + } + return; + } else { + synchronized(this){ + openFile(); + } + } + } + } catch (SAXException sae) { + throw new RuntimeException(sae); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + } + + Parser() { + t = new Thread(this); + t.setDaemon(true); + t.start(); + } + + String[] tuple; + NoMoreDataException nmde; + + String[] next() throws NoMoreDataException { + String[] result; + synchronized(this){ + while(tuple == null && nmde == null){ + try { + wait(); + } catch (InterruptedException ie) { + } + } + if (nmde != null) { + throw nmde; + } + result = tuple; + tuple = null; + notify(); + } + return result; + } + + StringBuffer contents = new StringBuffer(); + + public void characters(char[] ch, int start, int length) { + contents.append(ch, start, length); + } + + String title; + String body; + String time; + + static final int BASE = 10; + + public void startElement(String namespace, + String simple, + String qualified, + Attributes attributes) { + if (qualified.equals("page")) { + title = null; + body = null; + time = null; + } else if (qualified.equals("text")) { + contents.setLength(0); + } else if (qualified.equals("timestamp")) { + contents.setLength(0); + } else if (qualified.equals("title")) { + contents.setLength(0); + } + } + + String time(String original) { + StringBuffer buffer = new StringBuffer(); + + buffer.append(original.substring(8, 10)); + buffer.append('-'); + buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); + buffer.append('-'); + buffer.append(original.substring(0, 4)); + buffer.append(' '); + buffer.append(original.substring(11, 19)); + buffer.append(".000"); + + return buffer.toString(); + } + + public void create(String title, String time, String body) { + String[] t = new String[LENGTH]; + t[TITLE] = title.replace('\t', ' '); + t[DATE] = time.replace('\t', ' '); + t[BODY] = body.replaceAll("[\t\n]", " "); + synchronized(this) { + while(tuple!=null) { + try { + wait(); + } catch (InterruptedException ie) { + } + } + tuple = t; + notify(); + } + } + + public void endElement(String namespace, String simple, String qualified) + throws SAXException { + if (qualified.equals("title")) { + title = contents.toString(); + } else if (qualified.equals("text")) { + body = contents.toString(); + if (body.startsWith("#REDIRECT") || + body.startsWith("#redirect")) { + body = null; + } + } else if (qualified.equals("timestamp")) { + time = time(contents.toString()); + } else if (qualified.equals("page")) { + if (body != null) { + create(title, time, body); + } + } + } + } + + Parser parser = new Parser(); + + class DocState extends LineDocMaker.DocState { + public Document setFields(String[] tuple) { + titleField.setValue(tuple[TITLE]); + dateField.setValue(tuple[DATE]); + 
+      bodyField.setValue(tuple[BODY]);
+      return doc;
+    }
+  }
+
+  private DocState getDocState() {
+    DocState ds = (DocState) docState.get();
+    if (ds == null) {
+      ds = new DocState();
+      docState.set(ds);
+    }
+    return ds;
+  }
+
+  public Document makeDocument() throws Exception {
+    String[] tuple = parser.next();
+    return getDocState().setFields(tuple);
+  }
+
+}
\ No newline at end of file
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
index 0bdc6787626..de36bd4e008 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
@@ -25,7 +25,8 @@ import org.apache.lucene.document.Field;
 
 import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 
 /**
  * A DocMaker reading one line at a time as a Document from
@@ -39,13 +40,14 @@ import java.io.FileReader;
  */
 public class LineDocMaker extends BasicDocMaker {
 
-  private BufferedReader fileIn;
-  private ThreadLocal docState = new ThreadLocal();
+  FileInputStream fileIS;
+  BufferedReader fileIn;
+  ThreadLocal docState = new ThreadLocal();
   private String fileName;
 
   private static int READER_BUFFER_BYTES = 64*1024;
-
-  private class DocState {
+
+  class DocState {
     Document doc;
     Field bodyField;
     Field titleField;
@@ -63,7 +65,7 @@ public class LineDocMaker extends BasicDocMaker {
                             storeVal,
                             Field.Index.TOKENIZED,
                             termVecVal);
-      dateField = new Field(BasicDocMaker.TITLE_FIELD,
+      dateField = new Field(BasicDocMaker.DATE_FIELD,
                             "",
                             storeVal,
                             Field.Index.TOKENIZED,
@@ -143,11 +145,12 @@ public class LineDocMaker extends BasicDocMaker {
     openFile();
   }
 
-  private void openFile() {
+  void openFile() {
    try {
      if (fileIn != null)
        fileIn.close();
-      fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
+      fileIS = new FileInputStream(fileName);
+      fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
index 97c57c11ee0..05e1664bdd0 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
@@ -18,7 +18,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
  */
 
 import java.io.BufferedWriter;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
 
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@@ -59,7 +60,7 @@ public class WriteLineDocTask extends PerfTask {
       String fileName = config.get("line.file.out", null);
       if (fileName == null)
         throw new Exception("line.file.out must be set");
-      lineFileOut = new BufferedWriter(new FileWriter(fileName));
+      lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
     }
     docMaker = getRunData().getDocMaker();
   }
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
deleted file mode 100644
index 5b91da731f6..00000000000
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
+++ /dev/null
@@ -1,211 +0,0 @@
-package org.apache.lucene.benchmark.utils;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.helpers.XMLReaderFactory;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-
-/**
- * Extract the downloaded Wikipedia dump into separate files for indexing.
- */
-public class ExtractWikipedia {
-
-  private File wikipedia;
-  private File outputDir;
-
-  public ExtractWikipedia(File wikipedia, File outputDir) {
-    this.wikipedia = wikipedia;
-    this.outputDir = outputDir;
-    System.out.println("Deleting all files in " + outputDir);
-    File [] files = outputDir.listFiles();
-    for (int i = 0; i < files.length; i++) {
-      files[i].delete();
-    }
-  }
-
-  static public int count = 0;
-  static String[] months = {"JAN", "FEB", "MAR", "APR",
-                            "MAY", "JUN", "JUL", "AUG",
-                            "SEP", "OCT", "NOV", "DEC"};
-
-  public class Parser extends DefaultHandler {
-
-    public Parser() {
-    }
-
-    StringBuffer contents = new StringBuffer();
-
-    public void characters(char[] ch, int start, int length) {
-      contents.append(ch, start, length);
-    }
-
-    String title;
-    String id;
-    String body;
-    String time;
-
-    static final int BASE = 10;
-
-    public void startElement(String namespace,
-                             String simple,
-                             String qualified,
-                             Attributes attributes) {
-      if (qualified.equals("page")) {
-        title = null;
-        id = null;
-        body = null;
-        time = null;
-      } else if (qualified.equals("text")) {
-        contents.setLength(0);
-      } else if (qualified.equals("timestamp")) {
-        contents.setLength(0);
-      } else if (qualified.equals("title")) {
-        contents.setLength(0);
-      } else if (qualified.equals("id")) {
-        contents.setLength(0);
-      }
-    }
-
-    public File directory (int count, File directory) {
-      if (directory == null) {
-        directory = outputDir;
-      }
-      int base = BASE;
-      while (base <= count) {
-        base *= BASE;
-      }
-      if (count < BASE) {
-        return directory;
-      }
-      directory = new File (directory, (Integer.toString(base / BASE)));
-      directory = new File (directory, (Integer.toString(count / (base / BASE))));
-      return directory(count % (base / BASE), directory);
-    }
-
-    public void create(String id, String title, String time, String body) {
-
-      File d = directory(count++, null);
-      d.mkdirs();
-      File f = new File(d, id + ".txt");
-
-      StringBuffer contents = new StringBuffer();
-
-      contents.append(time);
-      contents.append("\n\n");
-      contents.append(title);
-      contents.append("\n\n");
-      contents.append(body);
-      contents.append("\n");
-
-      try {
-        FileWriter writer = new FileWriter(f);
-        writer.write(contents.toString());
-        writer.close();
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
-
-    }
-
-    String time(String original) {
-      StringBuffer buffer = new StringBuffer();
-
-      buffer.append(original.substring(8, 10));
-      buffer.append('-');
-      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
-      buffer.append('-');
-      buffer.append(original.substring(0, 4));
-      buffer.append(' ');
-      buffer.append(original.substring(11, 19));
-      buffer.append(".000");
-
-      return buffer.toString();
-    }
-
-    public void endElement(String namespace, String simple, String qualified) {
-      if (qualified.equals("title")) {
-        title = contents.toString();
-      } else if (qualified.equals("text")) {
-        body = contents.toString();
-        if (body.startsWith("#REDIRECT") ||
-            body.startsWith("#redirect")) {
-          body = null;
-        }
-      } else if (qualified.equals("timestamp")) {
-        time = time(contents.toString());
-      } else if (qualified.equals("id") && id == null) {
-        id = contents.toString();
-      } else if (qualified.equals("page")) {
-        if (body != null) {
-          create(id, title, time, body);
-        }
-      }
-    }
-  }
-
-  public void extract() {
-
-    try {
-      Parser parser = new Parser();
-      if (false) {
-        SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
-        sp.parse(new FileInputStream(wikipedia), parser);
-      } else {
-        XMLReader reader =
-          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
-        reader.setContentHandler(parser);
-        reader.setErrorHandler(parser);
-        reader.parse(new InputSource(new FileInputStream(wikipedia)));
-      }
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-  }
-
-  public static void main(String[] args) {
-    if (args.length != 2) {
-      printUsage();
-    }
-
-    File wikipedia = new File(args[0]);
-
-    if (wikipedia.exists()) {
-      File outputDir = new File(args[1]);
-      outputDir.mkdirs();
-      ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir);
-      extractor.extract();
-    } else {
-      printUsage();
-    }
-  }
-
-  private static void printUsage() {
-    System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia ");
-  }
-
-}
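
A minimal sketch (not part of this patch) of consuming the line file that
WriteLineDocTask writes to line.file.out (work/enwiki.txt in the alg above).
The class name is arbitrary, and the tab separator is an assumption: the
patch shows EnwikiDocMaker stripping tab and newline characters from the
title, date and body values, but the exact separator WriteLineDocTask uses
is not visible here.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;

public class PrintLineFileTitles {
  public static void main(String[] args) throws Exception {
    // Read the file the same way LineDocMaker does: buffered, UTF-8.
    BufferedReader in = new BufferedReader(
        new InputStreamReader(new FileInputStream("work/enwiki.txt"), "UTF-8"));
    String line;
    while ((line = in.readLine()) != null) {
      // Assumed layout: title <TAB> date <TAB> body (see note above).
      String[] parts = line.split("\t", 3);
      if (parts.length == 3) {
        System.out.println(parts[0] + " | " + parts[1]);
      }
    }
    in.close();
  }
}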