LUCENE-1591: add bzip2 compression/decompress to contrib/benchmark

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@765543 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-04-16 09:46:30 +00:00
parent fb8bb98c2a
commit 8c4fff6e21
9 changed files with 622 additions and 352 deletions

View File

@ -50,6 +50,11 @@ New features
a field needs to use a custom Collator. (Steven Rowe via Mike a field needs to use a custom Collator. (Steven Rowe via Mike
McCandless) McCandless)
4. LUCENE-1591: EnWikiDocMaker, LineDocMaker, WriteLineDoc can now
read/write bz2 using Apache commons compress library. This means
you can download the .bz2 export from http://wikipedia.org and
immediately index it. (Shai Erera via Mike McCandless)
Documentation Documentation

View File

@ -100,23 +100,14 @@
<antcall target="expand-reuters"/> <antcall target="expand-reuters"/>
<antcall target="extract-reuters"/> <antcall target="extract-reuters"/>
</target> </target>
<property name="digester.jar" value="commons-digester-1.7.jar"/>
<property name="collections.jar" value="commons-collections-3.1.jar"/>
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
<property name="xercesImpl.jar" value="xerces-2.9.1-patched-XERCESJ-1257.jar"/>
<property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
<path id="classpath"> <path id="classpath">
<pathelement path="${common.dir}/build/classes/java"/> <pathelement path="${common.dir}/build/classes/java"/>
<pathelement path="${common.dir}/build/classes/demo"/> <pathelement path="${common.dir}/build/classes/demo"/>
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/> <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
<pathelement path="lib/${digester.jar}"/> <fileset dir="lib">
<pathelement path="lib/${collections.jar}"/> <include name="**/*.jar"/>
<pathelement path="lib/${logging.jar}"/> </fileset>
<pathelement path="lib/${bean-utils.jar}"/>
<pathelement path="lib/${xercesImpl.jar}"/>
<pathelement path="lib/${xml-apis.jar}"/>
</path> </path>
<path id="run.classpath"> <path id="run.classpath">
<path refid="classpath"/> <path refid="classpath"/>

View File

@ -17,49 +17,75 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License. * limitations under the License.
*/ */
import org.xml.sax.XMLReader; import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory; import org.xml.sax.helpers.XMLReaderFactory;
import java.io.IOException;
import java.io.FileInputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.benchmark.byTask.utils.Config;
/** /**
* A LineDocMaker which reads the uncompressed english wikipedia dump. * A {@link LineDocMaker} which reads the english wikipedia
* * dump. You can read the .bz2 file directly (it will be
* decompressed on the fly).
* Config properties: * Config properties:
* keep.image.only.docs=false|true * <ul>
* <br/> * <li>keep.image.only.docs=false|true
* Plus those available in LineDocMaker * <li>[those available in {@link LineDocMaker}]
* * </ul>
* *
* @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
*/ */
public class EnwikiDocMaker extends LineDocMaker { public class EnwikiDocMaker extends LineDocMaker {
protected boolean keepImages = true;
private static final Map ELEMENTS = new HashMap();
static final int TITLE = 0; static final int TITLE = 0;
static final int DATE = TITLE + 1; static final int DATE = TITLE + 1;
static final int BODY = DATE + 1; static final int BODY = DATE + 1;
static final int ID = BODY + 1; static final int ID = BODY + 1;
static final int LENGTH = ID + 1; static final int LENGTH = ID + 1;
// LENGTH is used as the size of the tuple, so whatever constants we need that
// should not be part of the tuple, we should define them after LENGTH.
static final int PAGE = LENGTH + 1;
static final String[] months = {"JAN", "FEB", "MAR", "APR", static final String[] months = {"JAN", "FEB", "MAR", "APR",
"MAY", "JUN", "JUL", "AUG", "MAY", "JUN", "JUL", "AUG",
"SEP", "OCT", "NOV", "DEC"}; "SEP", "OCT", "NOV", "DEC"};
static {
ELEMENTS.put("page", new Integer(PAGE));
ELEMENTS.put("text", new Integer(BODY));
ELEMENTS.put("timestamp", new Integer(DATE));
ELEMENTS.put("title", new Integer(TITLE));
ELEMENTS.put("id", new Integer(ID));
}
/**
 * Maps an element's qualified name to its numeric type constant by looking it
 * up in {@code ELEMENTS}. Lets startElement and endElement switch on an int
 * instead of repeatedly comparing the qualified name against each known tag.
 *
 * @param elem the qualified element name to look up
 * @return the registered type for the element, or -1 if none is registered
 */
private final static int getElementType(String elem) {
  Object registered = ELEMENTS.get(elem);
  if (registered == null) {
    // Unknown element: callers treat -1 as "discard".
    return -1;
  }
  return ((Integer) registered).intValue();
}
protected boolean keepImages = true;
public void setConfig(Config config) { public void setConfig(Config config) {
super.setConfig(config); super.setConfig(config);
keepImages = config.get("keep.image.only.docs", true); keepImages = config.get("keep.image.only.docs", true);
} }
class Parser extends DefaultHandler implements Runnable { class Parser extends DefaultHandler implements Runnable {
Thread t; Thread t;
boolean threadDone; boolean threadDone;
@ -71,7 +97,7 @@ public class EnwikiDocMaker extends LineDocMaker {
reader.setContentHandler(this); reader.setContentHandler(this);
reader.setErrorHandler(this); reader.setErrorHandler(this);
while(true){ while(true){
final FileInputStream localFileIS = fileIS; final InputStream localFileIS = fileIS;
try { try {
InputSource is = new InputSource(localFileIS); InputSource is = new InputSource(localFileIS);
reader.parse(is); reader.parse(is);
@ -133,12 +159,13 @@ public class EnwikiDocMaker extends LineDocMaker {
t = null; t = null;
throw nmde; throw nmde;
} }
if (t != null && threadDone) if (t != null && threadDone) {
// The thread has exited yet did not hit end of // The thread has exited yet did not hit end of
// data, so this means it hit an exception. We // data, so this means it hit an exception. We
// throw NoMorDataException here to force // throw NoMorDataException here to force
// benchmark to stop the current alg: // benchmark to stop the current alg:
throw new NoMoreDataException(); throw new NoMoreDataException();
}
result = tuple; result = tuple;
tuple = null; tuple = null;
notify(); notify();
@ -157,25 +184,27 @@ public class EnwikiDocMaker extends LineDocMaker {
String time; String time;
String id; String id;
public void startElement(String namespace, public void startElement(String namespace,
String simple, String simple,
String qualified, String qualified,
Attributes attributes) { Attributes attributes) {
if (qualified.equals("page")) { int elemType = getElementType(qualified);
switch (elemType) {
case PAGE:
title = null; title = null;
body = null; body = null;
time = null; time = null;
id = null; id = null;
} else if (qualified.equals("text")) { break;
contents.setLength(0); // intentional fall-through.
} else if (qualified.equals("timestamp")) { case BODY:
contents.setLength(0); case DATE:
} else if (qualified.equals("title")) { case TITLE:
contents.setLength(0); case ID:
} else if (qualified.equals("id")) {
contents.setLength(0); contents.setLength(0);
break;
default:
// this element should be discarded.
} }
} }
@ -214,25 +243,34 @@ public class EnwikiDocMaker extends LineDocMaker {
public void endElement(String namespace, String simple, String qualified) public void endElement(String namespace, String simple, String qualified)
throws SAXException { throws SAXException {
if (qualified.equals("title")) { int elemType = getElementType(qualified);
title = contents.toString(); switch (elemType) {
} else if (qualified.equals("text")) { case PAGE:
// the body must not be null and we either are keeping image docs or the
// title does not start with Image:
if (body != null && (keepImages || !title.startsWith("Image:"))) {
create(title, time, body, id);
}
break;
case BODY:
body = contents.toString(); body = contents.toString();
//workaround that startswith doesn't have an ignore case option, get at least 20 chars. //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
if (startsWith.startsWith("#redirect")) { if (startsWith.startsWith("#redirect")) {
body = null; body = null;
} }
} else if (qualified.equals("timestamp")) { break;
case DATE:
time = time(contents.toString()); time = time(contents.toString());
} else if (qualified.equals("id") && id == null) {//just get the first id break;
case TITLE:
title = contents.toString();
break;
case ID:
id = contents.toString(); id = contents.toString();
} break;
else if (qualified.equals("page")) { default:
//the body must be null and we either are keeping image docs or the title does not start with Image: // this element should be discarded.
if (body != null && (keepImages == true || title.startsWith("Image:") == false)) {
create(title, time, body, id);
}
} }
} }
} }

View File

@ -17,38 +17,44 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.benchmark.byTask.utils.Config; import java.io.BufferedInputStream;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.Random; import java.util.Random;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/** /**
* A DocMaker reading one line at a time as a Document from * A DocMaker reading one line at a time as a Document from a single file. This
* a single file. This saves IO cost (over DirDocMaker) of * saves IO cost (over DirDocMaker) of recursing through a directory and opening
* recursing through a directory and opening a new file for * a new file for every document. It also re-uses its Document and Field
* every document. It also re-uses its Document and Field * instance to improve indexing speed.<br>
* instance to improve indexing speed. * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
* <i>title, date, body</i>. If a line is read in a different format, a
* {@link RuntimeException} will be thrown. In general, you should use this doc
* maker with files that were created with {@link WriteLineDocTask}.<br><br>
* *
* Config properties: * Config properties:
* docs.file=&lt;path to the file%gt; * <ul>
* doc.reuse.fields=true|false (default true) * <li>docs.file=&lt;path to the file&gt;
* doc.random.id.limit=N (default -1) -- create random * <li>doc.reuse.fields=true|false (default true)
* docid in the range 0..N; this is useful * <li>bzip.compression=true|false (default false)
* with UpdateDoc to test updating random documents; if * <li>doc.random.id.limit=N (default -1) -- create random docid in the range
* this is unspecified or -1, then docid is sequentially * 0..N; this is useful with UpdateDoc to test updating random documents; if
* assigned * this is unspecified or -1, then docid is sequentially assigned
* </ul>
*/ */
public class LineDocMaker extends BasicDocMaker { public class LineDocMaker extends BasicDocMaker {
FileInputStream fileIS; InputStream fileIS;
BufferedReader fileIn; BufferedReader fileIn;
ThreadLocal docState = new ThreadLocal(); ThreadLocal docState = new ThreadLocal();
private String fileName; private String fileName;
@ -57,9 +63,12 @@ public class LineDocMaker extends BasicDocMaker {
private final DocState localDocState = new DocState(); private final DocState localDocState = new DocState();
private boolean doReuseFields = true; private boolean doReuseFields = true;
private boolean bzipCompressionEnabled = false;
private Random r; private Random r;
private int numDocs; private int numDocs;
private CompressorStreamFactory csFactory = new CompressorStreamFactory();
class DocState { class DocState {
Document doc; Document doc;
Field bodyField; Field bodyField;
@ -93,7 +102,7 @@ public class LineDocMaker extends BasicDocMaker {
doc.add(idField); doc.add(idField);
} }
final static String SEP = WriteLineDocTask.SEP; final static char SEP = WriteLineDocTask.SEP;
private int numDocsCreated; private int numDocsCreated;
private synchronized int incrNumDocsCreated() { private synchronized int incrNumDocsCreated() {
@ -101,27 +110,20 @@ public class LineDocMaker extends BasicDocMaker {
} }
public Document setFields(String line) { public Document setFields(String line) {
// A line must be in the following format. If it's not, fail !
// title <TAB> date <TAB> body <NEWLINE> // title <TAB> date <TAB> body <NEWLINE>
final String title, date, body;
int spot = line.indexOf(SEP); int spot = line.indexOf(SEP);
if (spot != -1) { if (spot == -1) {
title = line.substring(0, spot); throw new RuntimeException("line: [" + line + "] is in an invalid format !");
int spot2 = line.indexOf(SEP, 1+spot);
if (spot2 != -1) {
date = line.substring(1+spot, spot2);
body = line.substring(1+spot2, line.length());
} else
date = body = "";
} else
title = date = body = "";
final String docID;
if (r != null) {
docID = "doc" + r.nextInt(numDocs);
} else {
docID = "doc" + incrNumDocsCreated();
} }
int spot2 = line.indexOf(SEP, 1 + spot);
if (spot2 == -1) {
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
}
final String title = line.substring(0, spot);
final String date = line.substring(1+spot, spot2);
final String body = line.substring(1+spot2, line.length());
final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
if (doReuseFields) { if (doReuseFields) {
idField.setValue(docID); idField.setValue(docID);
@ -130,7 +132,10 @@ public class LineDocMaker extends BasicDocMaker {
bodyField.setValue(body); bodyField.setValue(body);
return doc; return doc;
} else { } else {
Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); Field localIDField = new Field(BasicDocMaker.ID_FIELD,
docID,
Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS);
Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
title, title,
@ -174,16 +179,14 @@ public class LineDocMaker extends BasicDocMaker {
String line; String line;
synchronized(this) { synchronized(this) {
while(true) {
line = fileIn.readLine(); line = fileIn.readLine();
if (line == null) { if (line == null) {
if (!forever) {
throw new NoMoreDataException();
}
// Reset the file // Reset the file
openFile(); openFile();
if (!forever) return makeDocument();
throw new NoMoreDataException();
} else {
break;
}
} }
} }
@ -199,15 +202,24 @@ public class LineDocMaker extends BasicDocMaker {
public synchronized void resetInputs() { public synchronized void resetInputs() {
super.resetInputs(); super.resetInputs();
fileName = config.get("docs.file", null);
if (fileName == null)
throw new RuntimeException("docs.file must be set");
openFile(); openFile();
} }
public void setConfig(Config config) { public void setConfig(Config config) {
super.setConfig(config); super.setConfig(config);
fileName = config.get("docs.file", null);
if (fileName == null) {
throw new IllegalArgumentException("docs.file must be set");
}
doReuseFields = config.get("doc.reuse.fields", true); doReuseFields = config.get("doc.reuse.fields", true);
String doBZCompress = config.get("bzip.compression", null);
if (doBZCompress != null) {
// Property was set, use the value.
bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
} else {
// Property was not set, attempt to detect based on file's extension
bzipCompressionEnabled = fileName.endsWith("bz2");
}
numDocs = config.get("doc.random.id.limit", -1); numDocs = config.get("doc.random.id.limit", -1);
if (numDocs != -1) { if (numDocs != -1) {
r = new Random(179); r = new Random(179);
@ -216,16 +228,35 @@ public class LineDocMaker extends BasicDocMaker {
synchronized void openFile() { synchronized void openFile() {
try { try {
if (fileIn != null) if (fileIn != null) {
fileIn.close(); fileIn.close();
}
fileIS = new FileInputStream(fileName); fileIS = new FileInputStream(fileName);
if (bzipCompressionEnabled) {
// According to BZip2CompressorInputStream's code, it reads the first
// two file header chars ('B' and 'Z'). We only need to wrap the
// underlying stream with a BufferedInputStream, since the code uses
// the read() method exclusively.
fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
}
// Wrap the stream with a BufferedReader for several reasons:
// 1. We need the readLine() method.
// 2. Even if bzip.compression is enabled, and is wrapped with
// BufferedInputStream, wrapping with a buffer can still improve
// performance, since the BIS buffer will be used to read from the
// compressed stream, while the BR buffer will be used to read from the
// uncompressed stream.
fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES); fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} catch (CompressorException e) {
throw new RuntimeException(e);
} }
} }
public int numUniqueTexts() { public int numUniqueTexts() {
return -1; return -1;
} }
} }

View File

@ -17,18 +17,39 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedOutputStream;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter; import java.io.OutputStreamWriter;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
/**
* A task which writes documents, one line per document. Each line is in the
* following format: title &lt;TAB&gt; date &lt;TAB&gt; body. The output of this
* task can be consumed by
* {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
* to save the IO overhead of opening a file per document to be indexed.<br>
*
* Supports the following parameters:
* <ul>
* <li>line.file.out - the name of the file to write the output to. That
* parameter is mandatory. <b>NOTE:</b> the file is re-created.
* <li>bzip.compression - whether the output should be bzip-compressed. This is
* recommended when the output file is expected to be large. (optional, default:
* false).
* <li>doc.writeline.log.step - controls how many records to process before
* logging the status of the task. <b>NOTE:</b> to disable logging, set this
* value to 0 or negative. (optional, default:1000).
* </ul>
*/
public class WriteLineDocTask extends PerfTask { public class WriteLineDocTask extends PerfTask {
/** /**
@ -36,10 +57,7 @@ public class WriteLineDocTask extends PerfTask {
* an "added N docs" message should be logged. * an "added N docs" message should be logged.
*/ */
public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000; public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
public final static char SEP = '\t';
public WriteLineDocTask(PerfRunData runData) {
super(runData);
}
private int logStep = -1; private int logStep = -1;
private int docSize = 0; private int docSize = 0;
@ -47,22 +65,40 @@ public class WriteLineDocTask extends PerfTask {
private BufferedWriter lineFileOut = null; private BufferedWriter lineFileOut = null;
private DocMaker docMaker; private DocMaker docMaker;
public final static String SEP = "\t"; public WriteLineDocTask(PerfRunData runData) throws Exception {
super(runData);
/* Config config = runData.getConfig();
* (non-Javadoc)
* @see PerfTask#setup()
*/
public void setup() throws Exception {
super.setup();
if (lineFileOut==null) {
Config config = getRunData().getConfig();
String fileName = config.get("line.file.out", null); String fileName = config.get("line.file.out", null);
if (fileName == null) if (fileName == null) {
throw new Exception("line.file.out must be set"); throw new IllegalArgumentException("line.file.out must be set");
lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8")); }
OutputStream out = new FileOutputStream(fileName);
boolean doBzipCompression = false;
String doBZCompress = config.get("bzip.compression", null);
if (doBZCompress != null) {
// Property was set, use the value.
doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue();
} else {
// Property was not set, attempt to detect based on file's extension
doBzipCompression = fileName.endsWith("bz2");
}
if (doBzipCompression) {
// Wrap with BOS since BZip2CompressorOutputStream calls out.write(int)
// and does not use the write(byte[]) version. This proved to speed the
// compression process by 70% !
out = new BufferedOutputStream(out, 1 << 16);
out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
}
lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
docMaker = runData.getDocMaker();
logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
// To avoid the check 'if (logStep > 0)' in log(). This effectively turns
// logging off.
if (logStep <= 0) {
logStep = Integer.MAX_VALUE;
} }
docMaker = getRunData().getDocMaker();
} }
public void tearDown() throws Exception { public void tearDown() throws Exception {
@ -71,53 +107,42 @@ public class WriteLineDocTask extends PerfTask {
} }
public int doLogic() throws Exception { public int doLogic() throws Exception {
Document doc; Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
if (docSize > 0) {
doc = docMaker.makeDocument(docSize);
} else {
doc = docMaker.makeDocument();
}
Field f = doc.getField(BasicDocMaker.BODY_FIELD); Field f = doc.getField(BasicDocMaker.BODY_FIELD);
String body = f != null ? f.stringValue().replace('\t', ' ') : null;
String body, title, date;
if (f != null)
body = f.stringValue().replace('\t', ' ');
else
body = null;
f = doc.getField(BasicDocMaker.TITLE_FIELD);
if (f != null)
title = f.stringValue().replace('\t', ' ');
else
title = "";
f = doc.getField(BasicDocMaker.DATE_FIELD);
if (f != null)
date = f.stringValue().replace('\t', ' ');
else
date = "";
if (body != null) { if (body != null) {
f = doc.getField(BasicDocMaker.TITLE_FIELD);
String title = f != null ? f.stringValue().replace('\t', ' ') : "";
f = doc.getField(BasicDocMaker.DATE_FIELD);
String date = f != null ? f.stringValue().replace('\t', ' ') : "";
lineFileOut.write(title, 0, title.length()); lineFileOut.write(title, 0, title.length());
lineFileOut.write(SEP); lineFileOut.write(SEP);
lineFileOut.write(date, 0, date.length()); lineFileOut.write(date, 0, date.length());
lineFileOut.write(SEP); lineFileOut.write(SEP);
lineFileOut.write(body, 0, body.length()); lineFileOut.write(body, 0, body.length());
lineFileOut.newLine(); lineFileOut.newLine();
lineFileOut.flush();
} }
return 1; return 1;
} }
private void log(int count) { private void log(int count) {
if (logStep<0) { // logStep is initialized in the ctor to a positive value. If the config
// init once per instance // file indicates no logging, or contains an invalid value, logStep is init
logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); // to Integer.MAX_VALUE, so that logging will not occur (at least for the
// first Integer.MAX_VALUE records).
if (count % logStep == 0) {
System.out.println("--> " + Thread.currentThread().getName()
+ " processed (write line) " + count + " docs");
} }
if (logStep>0 && (count%logStep)==0) {
System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
} }
public void close() throws Exception {
lineFileOut.close();
super.close();
} }
/** /**
@ -125,7 +150,9 @@ public class WriteLineDocTask extends PerfTask {
* @param params docSize, or 0 for no limit. * @param params docSize, or 0 for no limit.
*/ */
public void setParams(String params) { public void setParams(String params) {
if (super.supportsParams()) {
super.setParams(params); super.setParams(params);
}
docSize = (int) Float.parseFloat(params); docSize = (int) Float.parseFloat(params);
} }
@ -135,4 +162,5 @@ public class WriteLineDocTask extends PerfTask {
public boolean supportsParams() { public boolean supportsParams() {
return true; return true;
} }
} }

View File

@ -0,0 +1,38 @@
package org.apache.lucene.benchmark;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import junit.framework.TestCase;
/**
 * Base class for all Benchmark unit tests. Provides a single working
 * directory that is resolved and created once, when the class is initialized.
 */
public class BenchmarkTestCase extends TestCase {

  /** Absolute working directory, set up once during class initialization. */
  private static final File workDir = prepareWorkDir();

  /**
   * Resolves the directory named by the "benchmark.work.dir" system property
   * (defaulting to "test/benchmark"), converts it to an absolute file and
   * creates any missing directories. The result of mkdirs() is not checked.
   */
  private static File prepareWorkDir() {
    String location = System.getProperty("benchmark.work.dir", "test/benchmark");
    File dir = new File(location).getAbsoluteFile();
    dir.mkdirs();
    return dir;
  }

  /** Returns the absolute working directory set up at class initialization. */
  public File getWorkDir() {
    return workDir;
  }
}

View File

@ -17,23 +17,19 @@
package org.apache.lucene.benchmark.byTask; package org.apache.lucene.benchmark.byTask;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.apache.lucene.benchmark.byTask.tasks.PerfTask; import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
import org.apache.lucene.benchmark.byTask.utils.Algorithm; import org.apache.lucene.benchmark.byTask.utils.Algorithm;
import java.io.File; /** Test very simply that perf tasks are parses as expected. */
import java.io.StringReader;
import java.lang.reflect.Modifier;
import java.util.ArrayList;
import java.util.Iterator;
/**
* Test very simply that perf tasks are parses as expected.
*/
public class TestPerfTasksParse extends TestCase { public class TestPerfTasksParse extends TestCase {
private static final boolean DEBUG = false;
static final String NEW_LINE = System.getProperty("line.separator"); static final String NEW_LINE = System.getProperty("line.separator");
static final String INDENT = " "; static final String INDENT = " ";
@ -43,162 +39,11 @@ public class TestPerfTasksParse extends TestCase {
INDENT + "print.props=false" + NEW_LINE INDENT + "print.props=false" + NEW_LINE
; ;
/*
* All known tasks.
* As new tasks are added, add them here.
* It would be nice to do that automatically, unfortunately
* Java does not provide a "get all classes in package" or
* "get all sub-classes" functionality.
*/
static String singleTaskAlgs [];
/* (non-Javadoc)
* @see junit.framework.TestCase#setUp()
*/
protected void setUp() throws Exception {
super.setUp();
if (singleTaskAlgs==null) {
singleTaskAlgs = findTasks();
}
}
// one time initialization
static String [] findTasks () throws Exception {
ArrayList tsks = new ArrayList();
// init with tasks we know about
tsks.add( " AddDoc " );
tsks.add( " AddDoc(1000.0) " );
tsks.add( " ClearStats " );
tsks.add( " CloseIndex " );
tsks.add( " CloseReader " );
tsks.add( " CreateIndex " );
tsks.add( " DeleteDoc " );
tsks.add( " DeleteDoc(500.0) " );
tsks.add( " NewRound " );
tsks.add( " OpenIndex " );
tsks.add( " OpenReader " );
tsks.add( " Optimize " );
tsks.add( " RepAll " );
tsks.add( " RepSelectByPref prefix " );
tsks.add( " RepSumByNameRound " );
tsks.add( " RepSumByName " );
tsks.add( " RepSumByPrefRound prefix " );
tsks.add( " RepSumByPref prefix " );
tsks.add( " ResetInputs " );
tsks.add( " ResetSystemErase " );
tsks.add( " ResetSystemSoft " );
tsks.add( " Search " );
tsks.add( " SearchTravRet " );
tsks.add( " SearchTravRet(100.0) " );
tsks.add( " SearchTrav " );
tsks.add( " SearchTrav(50.0) " );
tsks.add( " SetProp " );
tsks.add( " SetProp(name,value) " );
tsks.add( " Warm " );
tsks.add( "SearchTravRetLoadFieldSelector");
tsks.add("SearchTravRetLoadFieldSelector(body,title)");
// if tasks.dir property is defined, look for additional tasks.
// this somewhat covers tasks that would be added in the future, in case
// the list above is not updated to cover them.
// some tasks would be tested more than once this way, but that's ok.
String tasksDir = System.getProperty("tasks.dir");
if (tasksDir !=null) {
String pkgPrefix = PerfTask.class.getPackage().getName()+".";
String taskNames[] = new File(tasksDir).list();
for (int i = 0; i < taskNames.length; i++) {
String name = taskNames[i].trim();
if (!name.endsWith("Task.class"))
continue; // Task class file only
name = name.substring(0,name.length()-6);
Class cls = Class.forName(pkgPrefix+name);
if (Modifier.isAbstract(cls.getModifiers()) || Modifier.isInterface(cls.getModifiers()))
continue; // skip sbstract classes
if (!PerfTask.class.isAssignableFrom(cls))
continue; // not a task
name = name.substring(0,name.length()-4);
if (name.startsWith("Rep") && name.indexOf("Pref")>=0)
name += " prefix";
tsks.add(" "+name+" ");
}
}
return (String[]) tsks.toArray(new String[0]);
}
/**
* @param name test name
*/
public TestPerfTasksParse(String name) { public TestPerfTasksParse(String name) {
super(name); super(name);
} }
/** /** Test the repetiotion parsing for parallel tasks */
* Test the parsing of very simple tasks, for all tasks
*/
public void testAllTasksSimpleParse() {
doTestAllTasksSimpleParse(false,false);
}
/**
 * Parses every known task wrapped in a sequential sequence
 * (<code>{ task } : 3</code>) and checks that nothing is reported as parallel.
 */
public void testAllTasksSimpleParseSequntial() {
  final boolean wrapInSequence = true;
  final boolean parallel = false;
  doTestAllTasksSimpleParse(wrapInSequence, parallel);
}
/**
 * Parses every known task wrapped in a parallel sequence
 * (<code>[ task ] : 2</code>) and checks that a parallel sequence is found.
 */
public void testAllTasksSimpleParseParallel() {
  final boolean wrapInSequence = true;
  final boolean parallel = true;
  doTestAllTasksSimpleParse(wrapInSequence, parallel);
}
// Shared driver for the "simple parse" tests. For each entry of singleTaskAlgs it
// builds an algorithm text (optionally wrapping the task in a sequential
// "{ ... } : 3" or parallel "[ ... ] : 2" sequence), parses it, and verifies that
// the task name shows up and that parallelism matches what was requested.
private void doTestAllTasksSimpleParse(boolean parOrSeq, boolean par) {
  for (int taskIdx = 0; taskIdx < singleTaskAlgs.length; taskIdx++) {
    // Decide how the task appears in the algorithm text.
    String testedTask;
    if (!parOrSeq) {
      testedTask = singleTaskAlgs[taskIdx];
    } else if (par) {
      testedTask = "[ " + singleTaskAlgs[taskIdx] + " ] : 2";
    } else {
      testedTask = "{ " + singleTaskAlgs[taskIdx] + " } : 3";
    }
    try {
      String algText = propPart+INDENT+testedTask;
      logTstParsing(algText);
      Algorithm alg = new Benchmark(new StringReader(algText)).getAlgorithm();
      ArrayList parsedTasks = alg.extractTasks();
      // Collapse runs of spaces so the name can be matched against task.toString().
      String expectedName = singleTaskAlgs[taskIdx].replaceAll(" +"," ").trim();
      boolean nameSeen = false;
      boolean parallelSeen = false;
      Iterator it = parsedTasks.iterator();
      while (it.hasNext()) {
        PerfTask task = (PerfTask) it.next();
        if (task.toString().indexOf(expectedName)>=0) {
          nameSeen = true;
        }
        if (task instanceof TaskSequence && ((TaskSequence)task).isParallel()) {
          parallelSeen = true;
        }
      }
      // must find a task with this name in the algorithm
      assertTrue("Task "+testedTask+" was not found in "+alg.toString(),nameSeen);
      if (parOrSeq && par) {
        assertTrue("Task "+testedTask+" was supposed to be parallel in "+alg.toString(),parallelSeen);
      } else if (parOrSeq) {
        assertFalse("Task "+testedTask+" was not supposed to be parallel in "+alg.toString(),parallelSeen);
      }
    } catch (Exception e) {
      // Any parsing/setup exception is reported as a test failure.
      System.out.flush();
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
}
/**
* Test the repetition parsing for parallel tasks
*/
public void testParseParallelTaskSequenceRepetition() throws Exception { public void testParseParallelTaskSequenceRepetition() throws Exception {
String taskStr = "AddDoc"; String taskStr = "AddDoc";
String parsedTasks = "[ "+taskStr+" ] : 1000"; String parsedTasks = "[ "+taskStr+" ] : 1000";
@ -219,9 +64,7 @@ public class TestPerfTasksParse extends TestCase {
} }
} }
/** /** Test the repetiotion parsing for sequential tasks */
* Test the repetition parsing for sequential tasks
*/
public void testParseTaskSequenceRepetition() throws Exception { public void testParseTaskSequenceRepetition() throws Exception {
String taskStr = "AddDoc"; String taskStr = "AddDoc";
String parsedTasks = "{ "+taskStr+" } : 1000"; String parsedTasks = "{ "+taskStr+" } : 1000";
@ -242,11 +85,4 @@ public class TestPerfTasksParse extends TestCase {
} }
} }
// Prints the algorithm text about to be parsed; silent unless DEBUG is set.
private void logTstParsing (String txt) {
  if (DEBUG) {
    System.out.println("Test parsing of");
    System.out.println(txt);
  }
}
} }

View File

@ -0,0 +1,169 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Properties;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
/** Tests the functionality of {@link LineDocMaker}. */
public class LineDocMakerTest extends BenchmarkTestCase {

  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

  /** A well-formed line: title, date and body joined by {@link WriteLineDocTask#SEP}. */
  private static final String VALID_LINE =
      "title" + WriteLineDocTask.SEP + "date" + WriteLineDocTask.SEP + "body";

  /**
   * Writes <code>line</code> followed by a line separator to <code>out</code>
   * using UTF-8. The stream is always closed, even if writing fails, so a
   * failing test does not leave the file open.
   */
  private static void writeLineAndClose(OutputStream out, String line) throws Exception {
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
    try {
      writer.write(line);
      writer.newLine();
    } finally {
      writer.close();
    }
  }

  /** Creates a bzip2-compressed file containing a single valid line. */
  private void createBZ2LineFile(File file) throws Exception {
    OutputStream fileOut = new FileOutputStream(file);
    OutputStream bz2Out;
    try {
      bz2Out = csFactory.createCompressorOutputStream("bzip2", fileOut);
    } catch (Exception e) {
      // Don't leak the file handle if the compressor stream can't be created.
      fileOut.close();
      throw e;
    }
    writeLineAndClose(bz2Out, VALID_LINE);
  }

  /** Creates an uncompressed file containing a single valid line. */
  private void createRegularLineFile(File file) throws Exception {
    writeLineAndClose(new FileOutputStream(file), VALID_LINE);
  }

  /**
   * Indexes <code>file</code> with {@link LineDocMaker} into a RAMDirectory and
   * verifies that exactly one document matches <code>body:body</code>.
   *
   * @param file the line file to read documents from
   * @param setBZCompress whether to set the <code>bzip.compression</code> property at all
   * @param bz2CompressVal the value for <code>bzip.compression</code>; only
   *        used when <code>setBZCompress</code> is true
   */
  private void doIndexAndSearchTest(File file, boolean setBZCompress,
      String bz2CompressVal) throws Exception {

    Properties props = new Properties();

    // LineDocMaker specific settings.
    props.setProperty("docs.file", file.getAbsolutePath());
    if (setBZCompress) {
      props.setProperty("bzip.compression", bz2CompressVal);
    }

    // Indexing configuration.
    props.setProperty("analyzer", SimpleAnalyzer.class.getName());
    props.setProperty("doc.maker", LineDocMaker.class.getName());
    props.setProperty("directory", "RAMDirectory");

    // Create PerfRunData
    Config config = new Config(props);
    PerfRunData runData = new PerfRunData(config);

    TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
    tasks.addTask(new CreateIndexTask(runData));
    tasks.addTask(new AddDocTask(runData));
    tasks.addTask(new CloseIndexTask(runData));
    tasks.doLogic();

    IndexSearcher searcher = new IndexSearcher(runData.getDirectory());
    try {
      TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
      assertEquals(1, td.totalHits);
      assertNotNull(td.scoreDocs[0]);
    } finally {
      // Close the searcher even when an assertion above fails.
      searcher.close();
    }
  }

  /** Tests LineDocMaker with a bzip2 input stream and bzip.compression=true. */
  public void testBZip2() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file);
    doIndexAndSearchTest(file, true, "true");
  }

  /** Tests a bzip2 file named *.bz2 without setting bzip.compression at all. */
  public void testBZip2AutoDetect() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file);
    doIndexAndSearchTest(file, false, null);
  }

  /** A bzip2 file read with bzip.compression=false is expected to fail. */
  public void testBZip2WithBzipCompressionDisabled() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file);

    try {
      doIndexAndSearchTest(file, true, "false");
      fail("Some exception should have been thrown !");
    } catch (Exception e) {
      // expected.
    }
  }

  /** Tests an uncompressed file without setting bzip.compression. */
  public void testRegularFile() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file);
    doIndexAndSearchTest(file, false, null);
  }

  /** An uncompressed file read with bzip.compression=true is expected to fail. */
  public void testRegularFileWithBZipCompressionEnabled() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file);

    try {
      doIndexAndSearchTest(file, true, "true");
      fail("Some exception should have been thrown !");
    } catch (Exception e) {
      // expected.
    }
  }

  /** Each malformed line below is expected to make indexing throw. */
  public void testInvalidFormat() throws Exception {
    String[] testCases = new String[] {
      "", // empty line
      "title", // just title
      "title" + WriteLineDocTask.SEP, // title + SEP
      "title" + WriteLineDocTask.SEP + "body", // title + SEP + body
      // note that title + SEP + body + SEP is a valid line, which results in an
      // empty body
    };

    for (int i = 0; i < testCases.length; i++) {
      File file = new File(getWorkDir(), "one-line");
      writeLineAndClose(new FileOutputStream(file), testCases[i]);
      try {
        doIndexAndSearchTest(file, false, null);
        fail("Some exception should have been thrown for: [" + testCases[i] + "]");
      } catch (Exception e) {
        // expected.
      }
    }
  }

}

View File

@ -0,0 +1,134 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
/** Tests the functionality of {@link WriteLineDocTask}. */
public class WriteLineDocTaskTest extends BenchmarkTestCase {

  /**
   * Doc maker that always produces the same document with "body", "title"
   * and "date" field values.
   */
  // class has to be public so that Class.forName.newInstance() will work
  public static final class WriteLineDocMaker extends BasicDocMaker {

    protected DocData getNextDocData() throws NoMoreDataException, Exception {
      throw new UnsupportedOperationException("not implemented");
    }

    public Document makeDocument() throws Exception {
      Document doc = new Document();
      doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
      doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
      doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
      return doc;
    }

    public int numUniqueTexts() {
      return 0;
    }

  }

  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

  /**
   * Builds a {@link PerfRunData} that uses {@link WriteLineDocMaker} and writes
   * its line file to <code>file</code>.
   *
   * @param file target of the <code>line.file.out</code> property
   * @param setBZCompress whether to set the <code>bzip.compression</code> property at all
   * @param bz2CompressVal the value for <code>bzip.compression</code>; only
   *        used when <code>setBZCompress</code> is true
   */
  private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
    Properties props = new Properties();
    props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
    props.setProperty("line.file.out", file.getAbsolutePath());
    if (setBZCompress) {
      props.setProperty("bzip.compression", bz2CompressVal);
    }
    props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
    Config config = new Config(props);
    return new PerfRunData(config);
  }

  /**
   * Runs a {@link WriteLineDocTask} once against <code>file</code>. The task is
   * closed even if <code>doLogic()</code> throws, so a failing test does not
   * leave the output file open.
   */
  private void doWriteTest(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
    PerfRunData runData = createPerfRunData(file, setBZCompress, bz2CompressVal);
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    try {
      wldt.doLogic();
    } finally {
      wldt.close();
    }
  }

  /**
   * Reads <code>file</code> back and verifies it holds exactly one line made of
   * "title", "date" and "body" separated by {@link WriteLineDocTask#SEP}.
   *
   * @param bz2File whether the file must be decompressed as bzip2 first
   */
  private void doReadTest(File file, boolean bz2File) throws Exception {
    InputStream in = new FileInputStream(file);
    BufferedReader br;
    try {
      if (bz2File) {
        in = csFactory.createCompressorInputStream("bzip2", in);
      }
      br = new BufferedReader(new InputStreamReader(in, "utf-8"));
    } catch (Exception e) {
      // Don't leak the file handle if the reader chain can't be set up.
      in.close();
      throw e;
    }
    try {
      String line = br.readLine();
      assertNotNull(line);
      String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
      assertEquals(3, parts.length);
      assertEquals("title", parts[0]);
      assertEquals("date", parts[1]);
      assertEquals("body", parts[2]);
      assertNull(br.readLine());
    } finally {
      br.close();
    }
  }

  /** Tests WriteLineDocTask with a bzip2 format (bzip.compression=true). */
  public void testBZip2() throws Exception {
    // Create a document in bz2 format.
    File file = new File(getWorkDir(), "one-line.bz2");
    doWriteTest(file, true, "true");
    doReadTest(file, true);
  }

  /**
   * Writes to a *.bz2 file without setting bzip.compression; the output is
   * expected to be readable as bzip2.
   */
  public void testBZip2AutoDetect() throws Exception {
    // Create a document in bz2 format.
    File file = new File(getWorkDir(), "one-line.bz2");
    doWriteTest(file, false, null);
    doReadTest(file, true);
  }

  /** Tests WriteLineDocTask with bzip.compression=false; output is plain text. */
  public void testRegularFile() throws Exception {
    // Create a document in regular format.
    File file = new File(getWorkDir(), "one-line");
    doWriteTest(file, true, "false");
    doReadTest(file, false);
  }

}