mirror of https://github.com/apache/lucene.git
LUCENE-1591: add bzip2 compression/decompress to contrib/benchmark
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@765543 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fb8bb98c2a
commit
8c4fff6e21
|
@ -50,6 +50,11 @@ New features
|
||||||
a field needs to use a custom Collator. (Steven Rowe via Mike
|
a field needs to use a custom Collator. (Steven Rowe via Mike
|
||||||
McCandless)
|
McCandless)
|
||||||
|
|
||||||
|
4. LUCENE-1591: EnWikiDocMaker, LineDocMaker, WriteLineDoc can now
|
||||||
|
read/write bz2 using Apache commons compress library. This means
|
||||||
|
you can download the .bz2 export from http://wikipedia.org and
|
||||||
|
immediately index it. (Shai Erera via Mike McCandless)
|
||||||
|
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
|
|
|
@ -100,23 +100,14 @@
|
||||||
<antcall target="expand-reuters"/>
|
<antcall target="expand-reuters"/>
|
||||||
<antcall target="extract-reuters"/>
|
<antcall target="extract-reuters"/>
|
||||||
</target>
|
</target>
|
||||||
<property name="digester.jar" value="commons-digester-1.7.jar"/>
|
|
||||||
<property name="collections.jar" value="commons-collections-3.1.jar"/>
|
|
||||||
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
|
|
||||||
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
|
|
||||||
<property name="xercesImpl.jar" value="xerces-2.9.1-patched-XERCESJ-1257.jar"/>
|
|
||||||
<property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
|
|
||||||
|
|
||||||
<path id="classpath">
|
<path id="classpath">
|
||||||
<pathelement path="${common.dir}/build/classes/java"/>
|
<pathelement path="${common.dir}/build/classes/java"/>
|
||||||
<pathelement path="${common.dir}/build/classes/demo"/>
|
<pathelement path="${common.dir}/build/classes/demo"/>
|
||||||
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
|
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
|
||||||
<pathelement path="lib/${digester.jar}"/>
|
<fileset dir="lib">
|
||||||
<pathelement path="lib/${collections.jar}"/>
|
<include name="**/*.jar"/>
|
||||||
<pathelement path="lib/${logging.jar}"/>
|
</fileset>
|
||||||
<pathelement path="lib/${bean-utils.jar}"/>
|
|
||||||
<pathelement path="lib/${xercesImpl.jar}"/>
|
|
||||||
<pathelement path="lib/${xml-apis.jar}"/>
|
|
||||||
</path>
|
</path>
|
||||||
<path id="run.classpath">
|
<path id="run.classpath">
|
||||||
<path refid="classpath"/>
|
<path refid="classpath"/>
|
||||||
|
|
|
@ -17,49 +17,75 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.xml.sax.XMLReader;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
import org.xml.sax.Attributes;
|
import org.xml.sax.Attributes;
|
||||||
import org.xml.sax.InputSource;
|
import org.xml.sax.InputSource;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
import org.xml.sax.XMLReader;
|
||||||
import org.xml.sax.helpers.DefaultHandler;
|
import org.xml.sax.helpers.DefaultHandler;
|
||||||
import org.xml.sax.helpers.XMLReaderFactory;
|
import org.xml.sax.helpers.XMLReaderFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A LineDocMaker which reads the uncompressed english wikipedia dump.
|
* A {@link LineDocMaker} which reads the english wikipedia
|
||||||
*
|
* dump. You can read the .bz2 file directly (it will be
|
||||||
|
* decompressed on the fly).
|
||||||
* Config properties:
|
* Config properties:
|
||||||
* keep.image.only.docs=false|true
|
* <ul>
|
||||||
* <br/>
|
* <li>keep.image.only.docs=false|true
|
||||||
* Plus those available in LineDocMaker
|
* <li>[those available in {@link LineDocMaker}]
|
||||||
*
|
* </ul>
|
||||||
*
|
*
|
||||||
* @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
|
* @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
|
||||||
*/
|
*/
|
||||||
public class EnwikiDocMaker extends LineDocMaker {
|
public class EnwikiDocMaker extends LineDocMaker {
|
||||||
protected boolean keepImages = true;
|
|
||||||
|
private static final Map ELEMENTS = new HashMap();
|
||||||
|
|
||||||
static final int TITLE = 0;
|
static final int TITLE = 0;
|
||||||
static final int DATE = TITLE+1;
|
static final int DATE = TITLE + 1;
|
||||||
static final int BODY = DATE+1;
|
static final int BODY = DATE + 1;
|
||||||
static final int ID = BODY + 1;
|
static final int ID = BODY + 1;
|
||||||
static final int LENGTH = ID+1;
|
static final int LENGTH = ID + 1;
|
||||||
|
// LENGTH is used as the size of the tuple, so whatever constants we need that
|
||||||
|
// should not be part of the tuple, we should define them after LENGTH.
|
||||||
|
static final int PAGE = LENGTH + 1;
|
||||||
|
|
||||||
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||||
"MAY", "JUN", "JUL", "AUG",
|
"MAY", "JUN", "JUL", "AUG",
|
||||||
"SEP", "OCT", "NOV", "DEC"};
|
"SEP", "OCT", "NOV", "DEC"};
|
||||||
|
|
||||||
|
static {
|
||||||
|
ELEMENTS.put("page", new Integer(PAGE));
|
||||||
|
ELEMENTS.put("text", new Integer(BODY));
|
||||||
|
ELEMENTS.put("timestamp", new Integer(DATE));
|
||||||
|
ELEMENTS.put("title", new Integer(TITLE));
|
||||||
|
ELEMENTS.put("id", new Integer(ID));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the type of the element if defined, otherwise returns -1. This
|
||||||
|
* method is useful in startElement and endElement, by not needing to compare
|
||||||
|
* the element qualified name over and over.
|
||||||
|
*/
|
||||||
|
private final static int getElementType(String elem) {
|
||||||
|
Integer val = (Integer) ELEMENTS.get(elem);
|
||||||
|
return val == null ? -1 : val.intValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean keepImages = true;
|
||||||
|
|
||||||
public void setConfig(Config config) {
|
public void setConfig(Config config) {
|
||||||
super.setConfig(config);
|
super.setConfig(config);
|
||||||
keepImages = config.get("keep.image.only.docs", true);
|
keepImages = config.get("keep.image.only.docs", true);
|
||||||
}
|
}
|
||||||
|
|
||||||
class Parser extends DefaultHandler implements Runnable {
|
class Parser extends DefaultHandler implements Runnable {
|
||||||
|
|
||||||
Thread t;
|
Thread t;
|
||||||
boolean threadDone;
|
boolean threadDone;
|
||||||
|
|
||||||
|
@ -71,7 +97,7 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
reader.setContentHandler(this);
|
reader.setContentHandler(this);
|
||||||
reader.setErrorHandler(this);
|
reader.setErrorHandler(this);
|
||||||
while(true){
|
while(true){
|
||||||
final FileInputStream localFileIS = fileIS;
|
final InputStream localFileIS = fileIS;
|
||||||
try {
|
try {
|
||||||
InputSource is = new InputSource(localFileIS);
|
InputSource is = new InputSource(localFileIS);
|
||||||
reader.parse(is);
|
reader.parse(is);
|
||||||
|
@ -133,12 +159,13 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
t = null;
|
t = null;
|
||||||
throw nmde;
|
throw nmde;
|
||||||
}
|
}
|
||||||
if (t != null && threadDone)
|
if (t != null && threadDone) {
|
||||||
// The thread has exited yet did not hit end of
|
// The thread has exited yet did not hit end of
|
||||||
// data, so this means it hit an exception. We
|
// data, so this means it hit an exception. We
|
||||||
// throw NoMoreDataException here to force
|
// throw NoMoreDataException here to force
|
||||||
// benchmark to stop the current alg:
|
// benchmark to stop the current alg:
|
||||||
throw new NoMoreDataException();
|
throw new NoMoreDataException();
|
||||||
|
}
|
||||||
result = tuple;
|
result = tuple;
|
||||||
tuple = null;
|
tuple = null;
|
||||||
notify();
|
notify();
|
||||||
|
@ -157,25 +184,27 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
String time;
|
String time;
|
||||||
String id;
|
String id;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void startElement(String namespace,
|
public void startElement(String namespace,
|
||||||
String simple,
|
String simple,
|
||||||
String qualified,
|
String qualified,
|
||||||
Attributes attributes) {
|
Attributes attributes) {
|
||||||
if (qualified.equals("page")) {
|
int elemType = getElementType(qualified);
|
||||||
|
switch (elemType) {
|
||||||
|
case PAGE:
|
||||||
title = null;
|
title = null;
|
||||||
body = null;
|
body = null;
|
||||||
time = null;
|
time = null;
|
||||||
id = null;
|
id = null;
|
||||||
} else if (qualified.equals("text")) {
|
break;
|
||||||
contents.setLength(0);
|
// intentional fall-through.
|
||||||
} else if (qualified.equals("timestamp")) {
|
case BODY:
|
||||||
contents.setLength(0);
|
case DATE:
|
||||||
} else if (qualified.equals("title")) {
|
case TITLE:
|
||||||
contents.setLength(0);
|
case ID:
|
||||||
} else if (qualified.equals("id")) {
|
|
||||||
contents.setLength(0);
|
contents.setLength(0);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// this element should be discarded.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -214,25 +243,34 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
|
|
||||||
public void endElement(String namespace, String simple, String qualified)
|
public void endElement(String namespace, String simple, String qualified)
|
||||||
throws SAXException {
|
throws SAXException {
|
||||||
if (qualified.equals("title")) {
|
int elemType = getElementType(qualified);
|
||||||
title = contents.toString();
|
switch (elemType) {
|
||||||
} else if (qualified.equals("text")) {
|
case PAGE:
|
||||||
|
// the body must be non-null and we either are keeping image docs or the
|
||||||
|
// title does not start with Image:
|
||||||
|
if (body != null && (keepImages || !title.startsWith("Image:"))) {
|
||||||
|
create(title, time, body, id);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case BODY:
|
||||||
body = contents.toString();
|
body = contents.toString();
|
||||||
//workaround that startsWith doesn't have an ignore case option: lower-case up to the first 10 chars.
|
//workaround that startsWith doesn't have an ignore case option: lower-case up to the first 10 chars.
|
||||||
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
|
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
|
||||||
if (startsWith.startsWith("#redirect")) {
|
if (startsWith.startsWith("#redirect")) {
|
||||||
body = null;
|
body = null;
|
||||||
}
|
}
|
||||||
} else if (qualified.equals("timestamp")) {
|
break;
|
||||||
|
case DATE:
|
||||||
time = time(contents.toString());
|
time = time(contents.toString());
|
||||||
} else if (qualified.equals("id") && id == null) {//just get the first id
|
break;
|
||||||
|
case TITLE:
|
||||||
|
title = contents.toString();
|
||||||
|
break;
|
||||||
|
case ID:
|
||||||
id = contents.toString();
|
id = contents.toString();
|
||||||
}
|
break;
|
||||||
else if (qualified.equals("page")) {
|
default:
|
||||||
//the body must be null and we either are keeping image docs or the title does not start with Image:
|
// this element should be discarded.
|
||||||
if (body != null && (keepImages == true || title.startsWith("Image:") == false)) {
|
|
||||||
create(title, time, body, id);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,38 +17,44 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import java.io.BufferedInputStream;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.document.Field;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.compressors.CompressorException;
|
||||||
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
|
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A DocMaker reading one line at a time as a Document from
|
* A DocMaker reading one line at a time as a Document from a single file. This
|
||||||
* a single file. This saves IO cost (over DirDocMaker) of
|
* saves IO cost (over DirDocMaker) of recursing through a directory and opening
|
||||||
* recursing through a directory and opening a new file for
|
* a new file for every document. It also re-uses its Document and Field
|
||||||
* every document. It also re-uses its Document and Field
|
* instance to improve indexing speed.<br>
|
||||||
* instance to improve indexing speed.
|
* The expected format of each line is (arguments are separated by <TAB>):
|
||||||
|
* <i>title, date, body</i>. If a line is read in a different format, a
|
||||||
|
* {@link RuntimeException} will be thrown. In general, you should use this doc
|
||||||
|
* maker with files that were created with {@link WriteLineDocTask}.<br><br>
|
||||||
*
|
*
|
||||||
* Config properties:
|
* Config properties:
|
||||||
* docs.file=<path to the file%gt;
|
* <ul>
|
||||||
* doc.reuse.fields=true|false (default true)
|
* <li>docs.file=<path to the file>
|
||||||
* doc.random.id.limit=N (default -1) -- create random
|
* <li>doc.reuse.fields=true|false (default true)
|
||||||
* docid in the range 0..N; this is useful
|
* <li>bzip.compression=true|false (default false)
|
||||||
* with UpdateDoc to test updating random documents; if
|
* <li>doc.random.id.limit=N (default -1) -- create random docid in the range
|
||||||
* this is unspecified or -1, then docid is sequentially
|
* 0..N; this is useful with UpdateDoc to test updating random documents; if
|
||||||
* assigned
|
* this is unspecified or -1, then docid is sequentially assigned
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public class LineDocMaker extends BasicDocMaker {
|
public class LineDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
FileInputStream fileIS;
|
InputStream fileIS;
|
||||||
BufferedReader fileIn;
|
BufferedReader fileIn;
|
||||||
ThreadLocal docState = new ThreadLocal();
|
ThreadLocal docState = new ThreadLocal();
|
||||||
private String fileName;
|
private String fileName;
|
||||||
|
@ -57,9 +63,12 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
private final DocState localDocState = new DocState();
|
private final DocState localDocState = new DocState();
|
||||||
|
|
||||||
private boolean doReuseFields = true;
|
private boolean doReuseFields = true;
|
||||||
|
private boolean bzipCompressionEnabled = false;
|
||||||
private Random r;
|
private Random r;
|
||||||
private int numDocs;
|
private int numDocs;
|
||||||
|
|
||||||
|
private CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||||
|
|
||||||
class DocState {
|
class DocState {
|
||||||
Document doc;
|
Document doc;
|
||||||
Field bodyField;
|
Field bodyField;
|
||||||
|
@ -93,7 +102,7 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
doc.add(idField);
|
doc.add(idField);
|
||||||
}
|
}
|
||||||
|
|
||||||
final static String SEP = WriteLineDocTask.SEP;
|
final static char SEP = WriteLineDocTask.SEP;
|
||||||
|
|
||||||
private int numDocsCreated;
|
private int numDocsCreated;
|
||||||
private synchronized int incrNumDocsCreated() {
|
private synchronized int incrNumDocsCreated() {
|
||||||
|
@ -101,27 +110,20 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Document setFields(String line) {
|
public Document setFields(String line) {
|
||||||
|
// A line must be in the following format. If it's not, fail !
|
||||||
// title <TAB> date <TAB> body <NEWLINE>
|
// title <TAB> date <TAB> body <NEWLINE>
|
||||||
final String title, date, body;
|
|
||||||
|
|
||||||
int spot = line.indexOf(SEP);
|
int spot = line.indexOf(SEP);
|
||||||
if (spot != -1) {
|
if (spot == -1) {
|
||||||
title = line.substring(0, spot);
|
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||||
int spot2 = line.indexOf(SEP, 1+spot);
|
|
||||||
if (spot2 != -1) {
|
|
||||||
date = line.substring(1+spot, spot2);
|
|
||||||
body = line.substring(1+spot2, line.length());
|
|
||||||
} else
|
|
||||||
date = body = "";
|
|
||||||
} else
|
|
||||||
title = date = body = "";
|
|
||||||
|
|
||||||
final String docID;
|
|
||||||
if (r != null) {
|
|
||||||
docID = "doc" + r.nextInt(numDocs);
|
|
||||||
} else {
|
|
||||||
docID = "doc" + incrNumDocsCreated();
|
|
||||||
}
|
}
|
||||||
|
int spot2 = line.indexOf(SEP, 1 + spot);
|
||||||
|
if (spot2 == -1) {
|
||||||
|
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||||
|
}
|
||||||
|
final String title = line.substring(0, spot);
|
||||||
|
final String date = line.substring(1+spot, spot2);
|
||||||
|
final String body = line.substring(1+spot2, line.length());
|
||||||
|
final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
|
||||||
|
|
||||||
if (doReuseFields) {
|
if (doReuseFields) {
|
||||||
idField.setValue(docID);
|
idField.setValue(docID);
|
||||||
|
@ -130,7 +132,10 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
bodyField.setValue(body);
|
bodyField.setValue(body);
|
||||||
return doc;
|
return doc;
|
||||||
} else {
|
} else {
|
||||||
Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
|
Field localIDField = new Field(BasicDocMaker.ID_FIELD,
|
||||||
|
docID,
|
||||||
|
Field.Store.YES,
|
||||||
|
Field.Index.NOT_ANALYZED_NO_NORMS);
|
||||||
|
|
||||||
Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
|
Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
|
||||||
title,
|
title,
|
||||||
|
@ -174,16 +179,14 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
String line;
|
String line;
|
||||||
synchronized(this) {
|
synchronized(this) {
|
||||||
while(true) {
|
|
||||||
line = fileIn.readLine();
|
line = fileIn.readLine();
|
||||||
if (line == null) {
|
if (line == null) {
|
||||||
|
if (!forever) {
|
||||||
|
throw new NoMoreDataException();
|
||||||
|
}
|
||||||
// Reset the file
|
// Reset the file
|
||||||
openFile();
|
openFile();
|
||||||
if (!forever)
|
return makeDocument();
|
||||||
throw new NoMoreDataException();
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -199,15 +202,24 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
public synchronized void resetInputs() {
|
public synchronized void resetInputs() {
|
||||||
super.resetInputs();
|
super.resetInputs();
|
||||||
fileName = config.get("docs.file", null);
|
|
||||||
if (fileName == null)
|
|
||||||
throw new RuntimeException("docs.file must be set");
|
|
||||||
openFile();
|
openFile();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setConfig(Config config) {
|
public void setConfig(Config config) {
|
||||||
super.setConfig(config);
|
super.setConfig(config);
|
||||||
|
fileName = config.get("docs.file", null);
|
||||||
|
if (fileName == null) {
|
||||||
|
throw new IllegalArgumentException("docs.file must be set");
|
||||||
|
}
|
||||||
doReuseFields = config.get("doc.reuse.fields", true);
|
doReuseFields = config.get("doc.reuse.fields", true);
|
||||||
|
String doBZCompress = config.get("bzip.compression", null);
|
||||||
|
if (doBZCompress != null) {
|
||||||
|
// Property was set, use the value.
|
||||||
|
bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
|
||||||
|
} else {
|
||||||
|
// Property was not set, attempt to detect based on file's extension
|
||||||
|
bzipCompressionEnabled = fileName.endsWith("bz2");
|
||||||
|
}
|
||||||
numDocs = config.get("doc.random.id.limit", -1);
|
numDocs = config.get("doc.random.id.limit", -1);
|
||||||
if (numDocs != -1) {
|
if (numDocs != -1) {
|
||||||
r = new Random(179);
|
r = new Random(179);
|
||||||
|
@ -216,16 +228,35 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
synchronized void openFile() {
|
synchronized void openFile() {
|
||||||
try {
|
try {
|
||||||
if (fileIn != null)
|
if (fileIn != null) {
|
||||||
fileIn.close();
|
fileIn.close();
|
||||||
|
}
|
||||||
fileIS = new FileInputStream(fileName);
|
fileIS = new FileInputStream(fileName);
|
||||||
fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
|
if (bzipCompressionEnabled) {
|
||||||
|
// According to BZip2CompressorInputStream's code, it reads the first
|
||||||
|
// two file header chars ('B' and 'Z'). We only need to wrap the
|
||||||
|
// underlying stream with a BufferedInputStream, since the code uses
|
||||||
|
// the read() method exclusively.
|
||||||
|
fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
|
||||||
|
fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
|
||||||
|
}
|
||||||
|
// Wrap the stream with a BufferedReader for several reasons:
|
||||||
|
// 1. We need the readLine() method.
|
||||||
|
// 2. Even if bzip.compression is enabled, and is wrapped with
|
||||||
|
// BufferedInputStream, wrapping with a buffer can still improve
|
||||||
|
// performance, since the BIS buffer will be used to read from the
|
||||||
|
// compressed stream, while the BR buffer will be used to read from the
|
||||||
|
// uncompressed stream.
|
||||||
|
fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
} catch (CompressorException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public int numUniqueTexts() {
|
public int numUniqueTexts() {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,18 +17,39 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
|
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
|
||||||
|
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A task which writes documents, one line per document. Each line is in the
|
||||||
|
* following format: title <TAB> date <TAB> body. The output of this
|
||||||
|
* task can be consumed by
|
||||||
|
* {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
|
||||||
|
* to save the IO overhead of opening a file per document to be indexed.<br>
|
||||||
|
*
|
||||||
|
* Supports the following parameters:
|
||||||
|
* <ul>
|
||||||
|
* <li>line.file.out - the name of the file to write the output to. That
|
||||||
|
* parameter is mandatory. <b>NOTE:</b> the file is re-created.
|
||||||
|
* <li>bzip.compression - whether the output should be bzip-compressed. This is
|
||||||
|
* recommended when the output file is expected to be large. (optional, default:
|
||||||
|
* false).
|
||||||
|
* <li>doc.writeline.log.step - controls how many records to process before
|
||||||
|
* logging the status of the task. <b>NOTE:</b> to disable logging, set this
|
||||||
|
* value to 0 or negative. (optional, default:1000).
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
public class WriteLineDocTask extends PerfTask {
|
public class WriteLineDocTask extends PerfTask {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -36,33 +57,48 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
* an "added N docs" message should be logged.
|
* an "added N docs" message should be logged.
|
||||||
*/
|
*/
|
||||||
public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
|
public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
|
||||||
|
public final static char SEP = '\t';
|
||||||
public WriteLineDocTask(PerfRunData runData) {
|
|
||||||
super(runData);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int logStep = -1;
|
private int logStep = -1;
|
||||||
private int docSize = 0;
|
private int docSize = 0;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
private BufferedWriter lineFileOut=null;
|
private BufferedWriter lineFileOut = null;
|
||||||
private DocMaker docMaker;
|
private DocMaker docMaker;
|
||||||
|
|
||||||
public final static String SEP = "\t";
|
public WriteLineDocTask(PerfRunData runData) throws Exception {
|
||||||
|
super(runData);
|
||||||
/*
|
Config config = runData.getConfig();
|
||||||
* (non-Javadoc)
|
|
||||||
* @see PerfTask#setup()
|
|
||||||
*/
|
|
||||||
public void setup() throws Exception {
|
|
||||||
super.setup();
|
|
||||||
if (lineFileOut==null) {
|
|
||||||
Config config = getRunData().getConfig();
|
|
||||||
String fileName = config.get("line.file.out", null);
|
String fileName = config.get("line.file.out", null);
|
||||||
if (fileName == null)
|
if (fileName == null) {
|
||||||
throw new Exception("line.file.out must be set");
|
throw new IllegalArgumentException("line.file.out must be set");
|
||||||
lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
|
}
|
||||||
|
|
||||||
|
OutputStream out = new FileOutputStream(fileName);
|
||||||
|
boolean doBzipCompression = false;
|
||||||
|
String doBZCompress = config.get("bzip.compression", null);
|
||||||
|
if (doBZCompress != null) {
|
||||||
|
// Property was set, use the value.
|
||||||
|
doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue();
|
||||||
|
} else {
|
||||||
|
// Property was not set, attempt to detect based on file's extension
|
||||||
|
doBzipCompression = fileName.endsWith("bz2");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doBzipCompression) {
|
||||||
|
// Wrap with BOS since BZip2CompressorOutputStream calls out.write(int)
|
||||||
|
// and does not use the write(byte[]) version. This proved to speed the
|
||||||
|
// compression process by 70% !
|
||||||
|
out = new BufferedOutputStream(out, 1 << 16);
|
||||||
|
out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
|
||||||
|
}
|
||||||
|
lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
|
||||||
|
docMaker = runData.getDocMaker();
|
||||||
|
logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
|
||||||
|
// To avoid the check 'if (logStep > 0)' in log(). This effectively turns
|
||||||
|
// logging off.
|
||||||
|
if (logStep <= 0) {
|
||||||
|
logStep = Integer.MAX_VALUE;
|
||||||
}
|
}
|
||||||
docMaker = getRunData().getDocMaker();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void tearDown() throws Exception {
|
public void tearDown() throws Exception {
|
||||||
|
@ -71,53 +107,42 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int doLogic() throws Exception {
|
public int doLogic() throws Exception {
|
||||||
Document doc;
|
Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
|
||||||
if (docSize > 0) {
|
|
||||||
doc = docMaker.makeDocument(docSize);
|
|
||||||
} else {
|
|
||||||
doc = docMaker.makeDocument();
|
|
||||||
}
|
|
||||||
|
|
||||||
Field f = doc.getField(BasicDocMaker.BODY_FIELD);
|
Field f = doc.getField(BasicDocMaker.BODY_FIELD);
|
||||||
|
String body = f != null ? f.stringValue().replace('\t', ' ') : null;
|
||||||
String body, title, date;
|
|
||||||
if (f != null)
|
|
||||||
body = f.stringValue().replace('\t', ' ');
|
|
||||||
else
|
|
||||||
body = null;
|
|
||||||
|
|
||||||
f = doc.getField(BasicDocMaker.TITLE_FIELD);
|
|
||||||
if (f != null)
|
|
||||||
title = f.stringValue().replace('\t', ' ');
|
|
||||||
else
|
|
||||||
title = "";
|
|
||||||
|
|
||||||
f = doc.getField(BasicDocMaker.DATE_FIELD);
|
|
||||||
if (f != null)
|
|
||||||
date = f.stringValue().replace('\t', ' ');
|
|
||||||
else
|
|
||||||
date = "";
|
|
||||||
|
|
||||||
if (body != null) {
|
if (body != null) {
|
||||||
|
f = doc.getField(BasicDocMaker.TITLE_FIELD);
|
||||||
|
String title = f != null ? f.stringValue().replace('\t', ' ') : "";
|
||||||
|
|
||||||
|
f = doc.getField(BasicDocMaker.DATE_FIELD);
|
||||||
|
String date = f != null ? f.stringValue().replace('\t', ' ') : "";
|
||||||
|
|
||||||
lineFileOut.write(title, 0, title.length());
|
lineFileOut.write(title, 0, title.length());
|
||||||
lineFileOut.write(SEP);
|
lineFileOut.write(SEP);
|
||||||
lineFileOut.write(date, 0, date.length());
|
lineFileOut.write(date, 0, date.length());
|
||||||
lineFileOut.write(SEP);
|
lineFileOut.write(SEP);
|
||||||
lineFileOut.write(body, 0, body.length());
|
lineFileOut.write(body, 0, body.length());
|
||||||
lineFileOut.newLine();
|
lineFileOut.newLine();
|
||||||
lineFileOut.flush();
|
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void log (int count) {
|
private void log(int count) {
|
||||||
if (logStep<0) {
|
// logStep is initialized in the ctor to a positive value. If the config
|
||||||
// init once per instance
|
// file indicates no logging, or contains an invalid value, logStep is init
|
||||||
logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
|
// to Integer.MAX_VALUE, so that logging will not occur (at least for the
|
||||||
|
// first Integer.MAX_VALUE records).
|
||||||
|
if (count % logStep == 0) {
|
||||||
|
System.out.println("--> " + Thread.currentThread().getName()
|
||||||
|
+ " processed (write line) " + count + " docs");
|
||||||
}
|
}
|
||||||
if (logStep>0 && (count%logStep)==0) {
|
|
||||||
System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void close() throws Exception {
|
||||||
|
lineFileOut.close();
|
||||||
|
super.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -125,7 +150,9 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
* @param params docSize, or 0 for no limit.
|
* @param params docSize, or 0 for no limit.
|
||||||
*/
|
*/
|
||||||
public void setParams(String params) {
|
public void setParams(String params) {
|
||||||
|
if (super.supportsParams()) {
|
||||||
super.setParams(params);
|
super.setParams(params);
|
||||||
|
}
|
||||||
docSize = (int) Float.parseFloat(params);
|
docSize = (int) Float.parseFloat(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,4 +162,5 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
public boolean supportsParams() {
|
public boolean supportsParams() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
package org.apache.lucene.benchmark;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/** Base class for all Benchmark unit tests. */
|
||||||
|
public class BenchmarkTestCase extends TestCase {
|
||||||
|
|
||||||
|
private static final File workDir;
|
||||||
|
|
||||||
|
static {
|
||||||
|
workDir = new File(System.getProperty("benchmark.work.dir", "test/benchmark")).getAbsoluteFile();
|
||||||
|
workDir.mkdirs();
|
||||||
|
}
|
||||||
|
|
||||||
|
public File getWorkDir() {
|
||||||
|
return workDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -17,188 +17,33 @@
|
||||||
|
|
||||||
package org.apache.lucene.benchmark.byTask;
|
package org.apache.lucene.benchmark.byTask;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
|
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
||||||
|
|
||||||
import java.io.File;
|
/** Test very simply that perf tasks are parsed as expected. */
|
||||||
import java.io.StringReader;
|
|
||||||
import java.lang.reflect.Modifier;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test very simply that perf tasks are parsed as expected.
|
|
||||||
*/
|
|
||||||
public class TestPerfTasksParse extends TestCase {
|
public class TestPerfTasksParse extends TestCase {
|
||||||
|
|
||||||
private static final boolean DEBUG = false;
|
|
||||||
static final String NEW_LINE = System.getProperty("line.separator");
|
static final String NEW_LINE = System.getProperty("line.separator");
|
||||||
static final String INDENT = " ";
|
static final String INDENT = " ";
|
||||||
|
|
||||||
// properties in effect in all tests here
|
// properties in effect in all tests here
|
||||||
static final String propPart =
|
static final String propPart =
|
||||||
INDENT+"directory=RAMDirectory" + NEW_LINE +
|
INDENT + "directory=RAMDirectory" + NEW_LINE +
|
||||||
INDENT+"print.props=false" + NEW_LINE
|
INDENT + "print.props=false" + NEW_LINE
|
||||||
;
|
;
|
||||||
|
|
||||||
/*
|
|
||||||
* All known tasks.
|
|
||||||
* As new tasks are added, add them here.
|
|
||||||
* It would be nice to do that automatically, unfortunately
|
|
||||||
* Java does not provide a "get all classes in package" or
|
|
||||||
* "get all sub-classes" functionality.
|
|
||||||
*/
|
|
||||||
static String singleTaskAlgs [];
|
|
||||||
|
|
||||||
/* (non-Javadoc)
|
|
||||||
* @see junit.framework.TestCase#setUp()
|
|
||||||
*/
|
|
||||||
protected void setUp() throws Exception {
|
|
||||||
super.setUp();
|
|
||||||
if (singleTaskAlgs==null) {
|
|
||||||
singleTaskAlgs = findTasks();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// one time initialization
|
|
||||||
static String [] findTasks () throws Exception {
|
|
||||||
ArrayList tsks = new ArrayList();
|
|
||||||
// init with tasks we know about
|
|
||||||
tsks.add( " AddDoc " );
|
|
||||||
tsks.add( " AddDoc(1000.0) " );
|
|
||||||
tsks.add( " ClearStats " );
|
|
||||||
tsks.add( " CloseIndex " );
|
|
||||||
tsks.add( " CloseReader " );
|
|
||||||
tsks.add( " CreateIndex " );
|
|
||||||
tsks.add( " DeleteDoc " );
|
|
||||||
tsks.add( " DeleteDoc(500.0) " );
|
|
||||||
tsks.add( " NewRound " );
|
|
||||||
tsks.add( " OpenIndex " );
|
|
||||||
tsks.add( " OpenReader " );
|
|
||||||
tsks.add( " Optimize " );
|
|
||||||
tsks.add( " RepAll " );
|
|
||||||
tsks.add( " RepSelectByPref prefix " );
|
|
||||||
tsks.add( " RepSumByNameRound " );
|
|
||||||
tsks.add( " RepSumByName " );
|
|
||||||
tsks.add( " RepSumByPrefRound prefix " );
|
|
||||||
tsks.add( " RepSumByPref prefix " );
|
|
||||||
tsks.add( " ResetInputs " );
|
|
||||||
tsks.add( " ResetSystemErase " );
|
|
||||||
tsks.add( " ResetSystemSoft " );
|
|
||||||
tsks.add( " Search " );
|
|
||||||
tsks.add( " SearchTravRet " );
|
|
||||||
tsks.add( " SearchTravRet(100.0) " );
|
|
||||||
tsks.add( " SearchTrav " );
|
|
||||||
tsks.add( " SearchTrav(50.0) " );
|
|
||||||
tsks.add( " SetProp " );
|
|
||||||
tsks.add( " SetProp(name,value) " );
|
|
||||||
tsks.add( " Warm " );
|
|
||||||
tsks.add( "SearchTravRetLoadFieldSelector");
|
|
||||||
tsks.add("SearchTravRetLoadFieldSelector(body,title)");
|
|
||||||
|
|
||||||
// if tasks.dir property is defined, look for additional tasks.
|
|
||||||
// this somewhat covers tasks that would be added in the future, in case
|
|
||||||
// the list above is not updated to cover them.
|
|
||||||
// some tasks would be tested more than once this way, but that's ok.
|
|
||||||
String tasksDir = System.getProperty("tasks.dir");
|
|
||||||
if (tasksDir !=null) {
|
|
||||||
String pkgPrefix = PerfTask.class.getPackage().getName()+".";
|
|
||||||
String taskNames[] = new File(tasksDir).list();
|
|
||||||
for (int i = 0; i < taskNames.length; i++) {
|
|
||||||
String name = taskNames[i].trim();
|
|
||||||
if (!name.endsWith("Task.class"))
|
|
||||||
continue; // Task class file only
|
|
||||||
name = name.substring(0,name.length()-6);
|
|
||||||
Class cls = Class.forName(pkgPrefix+name);
|
|
||||||
if (Modifier.isAbstract(cls.getModifiers()) || Modifier.isInterface(cls.getModifiers()))
|
|
||||||
continue; // skip abstract classes
|
|
||||||
if (!PerfTask.class.isAssignableFrom(cls))
|
|
||||||
continue; // not a task
|
|
||||||
name = name.substring(0,name.length()-4);
|
|
||||||
if (name.startsWith("Rep") && name.indexOf("Pref")>=0)
|
|
||||||
name += " prefix";
|
|
||||||
tsks.add(" "+name+" ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return (String[]) tsks.toArray(new String[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param name test name
|
|
||||||
*/
|
|
||||||
public TestPerfTasksParse(String name) {
|
public TestPerfTasksParse(String name) {
|
||||||
super(name);
|
super(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Test the repetition parsing for parallel tasks */
|
||||||
* Test the parsing of very simple tasks, for all tasks
|
|
||||||
*/
|
|
||||||
public void testAllTasksSimpleParse() {
|
|
||||||
doTestAllTasksSimpleParse(false,false);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test the parsing of simple sequential sequences, for all tasks
|
|
||||||
*/
|
|
||||||
public void testAllTasksSimpleParseSequntial() {
|
|
||||||
doTestAllTasksSimpleParse(true,false);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test the parsing of simple parallel sequences, for all tasks
|
|
||||||
*/
|
|
||||||
public void testAllTasksSimpleParseParallel() {
|
|
||||||
doTestAllTasksSimpleParse(true,true);
|
|
||||||
}
|
|
||||||
|
|
||||||
// utility for simple parsing testing of all tasks.
|
|
||||||
private void doTestAllTasksSimpleParse(boolean parOrSeq, boolean par) {
|
|
||||||
for (int i = 0; i < singleTaskAlgs.length; i++) {
|
|
||||||
String testedTask = singleTaskAlgs[i];
|
|
||||||
if (parOrSeq) {
|
|
||||||
if (par) {
|
|
||||||
testedTask = "[ " + testedTask + " ] : 2";
|
|
||||||
} else {
|
|
||||||
testedTask = "{ " + testedTask + " } : 3";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
String algText = propPart+INDENT+testedTask;
|
|
||||||
logTstParsing(algText);
|
|
||||||
Benchmark benchmark = new Benchmark(new StringReader(algText));
|
|
||||||
Algorithm alg = benchmark.getAlgorithm();
|
|
||||||
ArrayList algTasks = alg.extractTasks();
|
|
||||||
// must find a task with this name in the algorithm
|
|
||||||
boolean foundName = false;
|
|
||||||
boolean foundPar = false;
|
|
||||||
String theTask = singleTaskAlgs[i].replaceAll(" +"," ").trim();
|
|
||||||
for (Iterator iter = algTasks.iterator(); iter.hasNext();) {
|
|
||||||
PerfTask task = (PerfTask) iter.next();
|
|
||||||
foundName |= (task.toString().indexOf(theTask)>=0);
|
|
||||||
foundPar |= (task instanceof TaskSequence && ((TaskSequence)task).isParallel());
|
|
||||||
}
|
|
||||||
assertTrue("Task "+testedTask+" was not found in "+alg.toString(),foundName);
|
|
||||||
if (parOrSeq) {
|
|
||||||
if (par) {
|
|
||||||
assertTrue("Task "+testedTask+" was supposed to be parallel in "+alg.toString(),foundPar);
|
|
||||||
} else {
|
|
||||||
assertFalse("Task "+testedTask+" was not supposed to be parallel in "+alg.toString(),foundPar);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.out.flush();
|
|
||||||
e.printStackTrace();
|
|
||||||
fail(e.getMessage());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test the repetition parsing for parallel tasks
|
|
||||||
*/
|
|
||||||
public void testParseParallelTaskSequenceRepetition() throws Exception {
|
public void testParseParallelTaskSequenceRepetition() throws Exception {
|
||||||
String taskStr = "AddDoc";
|
String taskStr = "AddDoc";
|
||||||
String parsedTasks = "[ "+taskStr+" ] : 1000";
|
String parsedTasks = "[ "+taskStr+" ] : 1000";
|
||||||
|
@ -219,9 +64,7 @@ public class TestPerfTasksParse extends TestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Test the repetition parsing for sequential tasks */
|
||||||
* Test the repetition parsing for sequential tasks
|
|
||||||
*/
|
|
||||||
public void testParseTaskSequenceRepetition() throws Exception {
|
public void testParseTaskSequenceRepetition() throws Exception {
|
||||||
String taskStr = "AddDoc";
|
String taskStr = "AddDoc";
|
||||||
String parsedTasks = "{ "+taskStr+" } : 1000";
|
String parsedTasks = "{ "+taskStr+" } : 1000";
|
||||||
|
@ -242,11 +85,4 @@ public class TestPerfTasksParse extends TestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void logTstParsing (String txt) {
|
|
||||||
if (!DEBUG)
|
|
||||||
return;
|
|
||||||
System.out.println("Test parsing of");
|
|
||||||
System.out.println(txt);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,169 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedWriter;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
|
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
|
||||||
|
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
|
||||||
|
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
|
||||||
|
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
|
||||||
|
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
|
||||||
|
/** Tests the functionality of {@link LineDocMaker}. */
|
||||||
|
public class LineDocMakerTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
|
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||||
|
|
||||||
|
private void createBZ2LineFile(File file) throws Exception {
|
||||||
|
OutputStream out = new FileOutputStream(file);
|
||||||
|
out = csFactory.createCompressorOutputStream("bzip2", out);
|
||||||
|
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||||
|
StringBuffer doc = new StringBuffer();
|
||||||
|
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
|
||||||
|
writer.write(doc.toString());
|
||||||
|
writer.newLine();
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createRegularLineFile(File file) throws Exception {
|
||||||
|
OutputStream out = new FileOutputStream(file);
|
||||||
|
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||||
|
StringBuffer doc = new StringBuffer();
|
||||||
|
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
|
||||||
|
writer.write(doc.toString());
|
||||||
|
writer.newLine();
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doIndexAndSearchTest(File file, boolean setBZCompress,
|
||||||
|
String bz2CompressVal) throws Exception {
|
||||||
|
|
||||||
|
Properties props = new Properties();
|
||||||
|
|
||||||
|
// LineDocMaker specific settings.
|
||||||
|
props.setProperty("docs.file", file.getAbsolutePath());
|
||||||
|
if (setBZCompress) {
|
||||||
|
props.setProperty("bzip.compression", bz2CompressVal);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indexing configuration.
|
||||||
|
props.setProperty("analyzer", SimpleAnalyzer.class.getName());
|
||||||
|
props.setProperty("doc.maker", LineDocMaker.class.getName());
|
||||||
|
props.setProperty("directory", "RAMDirectory");
|
||||||
|
|
||||||
|
// Create PerfRunData
|
||||||
|
Config config = new Config(props);
|
||||||
|
PerfRunData runData = new PerfRunData(config);
|
||||||
|
|
||||||
|
TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
|
||||||
|
tasks.addTask(new CreateIndexTask(runData));
|
||||||
|
tasks.addTask(new AddDocTask(runData));
|
||||||
|
tasks.addTask(new CloseIndexTask(runData));
|
||||||
|
tasks.doLogic();
|
||||||
|
|
||||||
|
IndexSearcher searcher = new IndexSearcher(runData.getDirectory());
|
||||||
|
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
|
||||||
|
assertEquals(1, td.totalHits);
|
||||||
|
assertNotNull(td.scoreDocs[0]);
|
||||||
|
searcher.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tests LineDocMaker with a bzip2 input stream. */
|
||||||
|
public void testBZip2() throws Exception {
|
||||||
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
|
createBZ2LineFile(file);
|
||||||
|
doIndexAndSearchTest(file, true, "true");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBZip2AutoDetect() throws Exception {
|
||||||
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
|
createBZ2LineFile(file);
|
||||||
|
doIndexAndSearchTest(file, false, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBZip2WithBzipCompressionDisabled() throws Exception {
|
||||||
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
|
createBZ2LineFile(file);
|
||||||
|
|
||||||
|
try {
|
||||||
|
doIndexAndSearchTest(file, true, "false");
|
||||||
|
fail("Some exception should have been thrown !");
|
||||||
|
} catch (Exception e) {
|
||||||
|
// expected.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRegularFile() throws Exception {
|
||||||
|
File file = new File(getWorkDir(), "one-line");
|
||||||
|
createRegularLineFile(file);
|
||||||
|
doIndexAndSearchTest(file, false, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRegularFileWithBZipCompressionEnabled() throws Exception {
|
||||||
|
File file = new File(getWorkDir(), "one-line");
|
||||||
|
createRegularLineFile(file);
|
||||||
|
|
||||||
|
try {
|
||||||
|
doIndexAndSearchTest(file, true, "true");
|
||||||
|
fail("Some exception should have been thrown !");
|
||||||
|
} catch (Exception e) {
|
||||||
|
// expected.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testInvalidFormat() throws Exception {
|
||||||
|
String[] testCases = new String[] {
|
||||||
|
"", // empty line
|
||||||
|
"title", // just title
|
||||||
|
"title" + WriteLineDocTask.SEP, // title + SEP
|
||||||
|
"title" + WriteLineDocTask.SEP + "body", // title + SEP + body
|
||||||
|
// note that title + SEP + body + SEP is a valid line, which results in an
|
||||||
|
// empty body
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i = 0; i < testCases.length; i++) {
|
||||||
|
File file = new File(getWorkDir(), "one-line");
|
||||||
|
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
|
||||||
|
writer.write(testCases[i]);
|
||||||
|
writer.newLine();
|
||||||
|
writer.close();
|
||||||
|
try {
|
||||||
|
doIndexAndSearchTest(file, false, null);
|
||||||
|
fail("Some exception should have been thrown for: [" + testCases[i] + "]");
|
||||||
|
} catch (Exception e) {
|
||||||
|
// expected.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,134 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
|
||||||
|
import org.apache.lucene.benchmark.byTask.feeds.DocData;
|
||||||
|
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||||
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.Field.Index;
|
||||||
|
import org.apache.lucene.document.Field.Store;
|
||||||
|
|
||||||
|
/** Tests the functionality of {@link WriteLineDocTask}. */
|
||||||
|
public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
|
// class has to be public so that Class.forName.newInstance() will work
|
||||||
|
public static final class WriteLineDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
|
protected DocData getNextDocData() throws NoMoreDataException, Exception {
|
||||||
|
throw new UnsupportedOperationException("not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Document makeDocument() throws Exception {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int numUniqueTexts() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||||
|
|
||||||
|
private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
|
||||||
|
Properties props = new Properties();
|
||||||
|
props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
|
||||||
|
props.setProperty("line.file.out", file.getAbsolutePath());
|
||||||
|
if (setBZCompress) {
|
||||||
|
props.setProperty("bzip.compression", bz2CompressVal);
|
||||||
|
}
|
||||||
|
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
|
||||||
|
Config config = new Config(props);
|
||||||
|
return new PerfRunData(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doReadTest(File file, boolean bz2File) throws Exception {
|
||||||
|
InputStream in = new FileInputStream(file);
|
||||||
|
if (bz2File) {
|
||||||
|
in = csFactory.createCompressorInputStream("bzip2", in);
|
||||||
|
}
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||||
|
try {
|
||||||
|
String line = br.readLine();
|
||||||
|
assertNotNull(line);
|
||||||
|
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||||
|
assertEquals(3, parts.length);
|
||||||
|
assertEquals("title", parts[0]);
|
||||||
|
assertEquals("date", parts[1]);
|
||||||
|
assertEquals("body", parts[2]);
|
||||||
|
assertNull(br.readLine());
|
||||||
|
} finally {
|
||||||
|
br.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tests WriteLineDocTask with a bzip2 format. */
|
||||||
|
public void testBZip2() throws Exception {
|
||||||
|
|
||||||
|
// Create a document in bz2 format.
|
||||||
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
|
PerfRunData runData = createPerfRunData(file, true, "true");
|
||||||
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
|
wldt.doLogic();
|
||||||
|
wldt.close();
|
||||||
|
|
||||||
|
doReadTest(file, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBZip2AutoDetect() throws Exception {
|
||||||
|
|
||||||
|
// Create a document in bz2 format.
|
||||||
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
|
PerfRunData runData = createPerfRunData(file, false, null);
|
||||||
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
|
wldt.doLogic();
|
||||||
|
wldt.close();
|
||||||
|
|
||||||
|
doReadTest(file, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRegularFile() throws Exception {
|
||||||
|
|
||||||
|
// Create a document in regular format.
|
||||||
|
File file = new File(getWorkDir(), "one-line");
|
||||||
|
PerfRunData runData = createPerfRunData(file, true, "false");
|
||||||
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
|
wldt.doLogic();
|
||||||
|
wldt.close();
|
||||||
|
|
||||||
|
doReadTest(file, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue