mirror of https://github.com/apache/lucene.git
LUCENE-1591: add bzip2 compression/decompress to contrib/benchmark
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@765543 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fb8bb98c2a
commit
8c4fff6e21
|
@ -50,6 +50,11 @@ New features
|
|||
a field needs to use a custom Collator. (Steven Rowe via Mike
|
||||
McCandless)
|
||||
|
||||
4. LUCENE-1591: EnWikiDocMaker, LineDocMaker, WriteLineDoc can now
|
||||
read/write bz2 using Apache commons compress library. This means
|
||||
you can download the .bz2 export from http://wikipedia.org and
|
||||
immediately index it. (Shai Erera via Mike McCandless)
|
||||
|
||||
|
||||
Documentation
|
||||
|
||||
|
|
|
@ -100,23 +100,14 @@
|
|||
<antcall target="expand-reuters"/>
|
||||
<antcall target="extract-reuters"/>
|
||||
</target>
|
||||
<property name="digester.jar" value="commons-digester-1.7.jar"/>
|
||||
<property name="collections.jar" value="commons-collections-3.1.jar"/>
|
||||
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
|
||||
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
|
||||
<property name="xercesImpl.jar" value="xerces-2.9.1-patched-XERCESJ-1257.jar"/>
|
||||
<property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${common.dir}/build/classes/java"/>
|
||||
<pathelement path="${common.dir}/build/classes/demo"/>
|
||||
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
|
||||
<pathelement path="lib/${digester.jar}"/>
|
||||
<pathelement path="lib/${collections.jar}"/>
|
||||
<pathelement path="lib/${logging.jar}"/>
|
||||
<pathelement path="lib/${bean-utils.jar}"/>
|
||||
<pathelement path="lib/${xercesImpl.jar}"/>
|
||||
<pathelement path="lib/${xml-apis.jar}"/>
|
||||
<fileset dir="lib">
|
||||
<include name="**/*.jar"/>
|
||||
</fileset>
|
||||
</path>
|
||||
<path id="run.classpath">
|
||||
<path refid="classpath"/>
|
||||
|
|
|
@ -17,49 +17,75 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.xml.sax.XMLReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.XMLReader;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
import org.xml.sax.helpers.XMLReaderFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
|
||||
/**
|
||||
* A LineDocMaker which reads the uncompressed english wikipedia dump.
|
||||
*
|
||||
* A {@link LineDocMaker} which reads the english wikipedia
|
||||
* dump. You can read the .bz2 file directly (it will be
|
||||
* decompressed on the fly).
|
||||
* Config properties:
|
||||
* keep.image.only.docs=false|true
|
||||
* <br/>
|
||||
* Plus those available in LineDocMaker
|
||||
*
|
||||
*
|
||||
* <ul>
|
||||
* <li>keep.image.only.docs=false|true
|
||||
* <li>[those available in {@link LineDocMaker}]
|
||||
* </ul>
|
||||
*
|
||||
* @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
|
||||
*/
|
||||
public class EnwikiDocMaker extends LineDocMaker {
|
||||
protected boolean keepImages = true;
|
||||
|
||||
private static final Map ELEMENTS = new HashMap();
|
||||
|
||||
static final int TITLE = 0;
|
||||
static final int DATE = TITLE+1;
|
||||
static final int BODY = DATE+1;
|
||||
static final int DATE = TITLE + 1;
|
||||
static final int BODY = DATE + 1;
|
||||
static final int ID = BODY + 1;
|
||||
static final int LENGTH = ID+1;
|
||||
|
||||
static final int LENGTH = ID + 1;
|
||||
// LENGTH is used as the size of the tuple, so whatever constants we need that
|
||||
// should not be part of the tuple, we should define them after LENGTH.
|
||||
static final int PAGE = LENGTH + 1;
|
||||
|
||||
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||
"MAY", "JUN", "JUL", "AUG",
|
||||
"SEP", "OCT", "NOV", "DEC"};
|
||||
|
||||
static {
|
||||
ELEMENTS.put("page", new Integer(PAGE));
|
||||
ELEMENTS.put("text", new Integer(BODY));
|
||||
ELEMENTS.put("timestamp", new Integer(DATE));
|
||||
ELEMENTS.put("title", new Integer(TITLE));
|
||||
ELEMENTS.put("id", new Integer(ID));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the type of the element if defined, otherwise returns -1. This
|
||||
* method is useful in startElement and endElement, by not needing to compare
|
||||
* the element qualified name over and over.
|
||||
*/
|
||||
private final static int getElementType(String elem) {
|
||||
Integer val = (Integer) ELEMENTS.get(elem);
|
||||
return val == null ? -1 : val.intValue();
|
||||
}
|
||||
|
||||
protected boolean keepImages = true;
|
||||
|
||||
public void setConfig(Config config) {
|
||||
super.setConfig(config);
|
||||
keepImages = config.get("keep.image.only.docs", true);
|
||||
}
|
||||
|
||||
class Parser extends DefaultHandler implements Runnable {
|
||||
|
||||
Thread t;
|
||||
boolean threadDone;
|
||||
|
||||
|
@ -71,7 +97,7 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
reader.setContentHandler(this);
|
||||
reader.setErrorHandler(this);
|
||||
while(true){
|
||||
final FileInputStream localFileIS = fileIS;
|
||||
final InputStream localFileIS = fileIS;
|
||||
try {
|
||||
InputSource is = new InputSource(localFileIS);
|
||||
reader.parse(is);
|
||||
|
@ -133,12 +159,13 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
t = null;
|
||||
throw nmde;
|
||||
}
|
||||
if (t != null && threadDone)
|
||||
if (t != null && threadDone) {
|
||||
// The thread has exited yet did not hit end of
|
||||
// data, so this means it hit an exception. We
|
||||
// throw NoMorDataException here to force
|
||||
// benchmark to stop the current alg:
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
result = tuple;
|
||||
tuple = null;
|
||||
notify();
|
||||
|
@ -157,25 +184,27 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
String time;
|
||||
String id;
|
||||
|
||||
|
||||
|
||||
public void startElement(String namespace,
|
||||
String simple,
|
||||
String qualified,
|
||||
Attributes attributes) {
|
||||
if (qualified.equals("page")) {
|
||||
title = null;
|
||||
body = null;
|
||||
time = null;
|
||||
id = null;
|
||||
} else if (qualified.equals("text")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("title")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("id")) {
|
||||
contents.setLength(0);
|
||||
int elemType = getElementType(qualified);
|
||||
switch (elemType) {
|
||||
case PAGE:
|
||||
title = null;
|
||||
body = null;
|
||||
time = null;
|
||||
id = null;
|
||||
break;
|
||||
// intentional fall-through.
|
||||
case BODY:
|
||||
case DATE:
|
||||
case TITLE:
|
||||
case ID:
|
||||
contents.setLength(0);
|
||||
break;
|
||||
default:
|
||||
// this element should be discarded.
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -214,25 +243,34 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
|
||||
public void endElement(String namespace, String simple, String qualified)
|
||||
throws SAXException {
|
||||
if (qualified.equals("title")) {
|
||||
title = contents.toString();
|
||||
} else if (qualified.equals("text")) {
|
||||
body = contents.toString();
|
||||
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
|
||||
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
|
||||
if (startsWith.startsWith("#redirect")) {
|
||||
body = null;
|
||||
}
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
time = time(contents.toString());
|
||||
} else if (qualified.equals("id") && id == null) {//just get the first id
|
||||
id = contents.toString();
|
||||
}
|
||||
else if (qualified.equals("page")) {
|
||||
//the body must be null and we either are keeping image docs or the title does not start with Image:
|
||||
if (body != null && (keepImages == true || title.startsWith("Image:") == false)) {
|
||||
create(title, time, body, id);
|
||||
}
|
||||
int elemType = getElementType(qualified);
|
||||
switch (elemType) {
|
||||
case PAGE:
|
||||
// the body must be null and we either are keeping image docs or the
|
||||
// title does not start with Image:
|
||||
if (body != null && (keepImages || !title.startsWith("Image:"))) {
|
||||
create(title, time, body, id);
|
||||
}
|
||||
break;
|
||||
case BODY:
|
||||
body = contents.toString();
|
||||
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
|
||||
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
|
||||
if (startsWith.startsWith("#redirect")) {
|
||||
body = null;
|
||||
}
|
||||
break;
|
||||
case DATE:
|
||||
time = time(contents.toString());
|
||||
break;
|
||||
case TITLE:
|
||||
title = contents.toString();
|
||||
break;
|
||||
case ID:
|
||||
id = contents.toString();
|
||||
break;
|
||||
default:
|
||||
// this element should be discarded.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,38 +17,44 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.commons.compress.compressors.CompressorException;
|
||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
/**
|
||||
* A DocMaker reading one line at a time as a Document from
|
||||
* a single file. This saves IO cost (over DirDocMaker) of
|
||||
* recursing through a directory and opening a new file for
|
||||
* every document. It also re-uses its Document and Field
|
||||
* instance to improve indexing speed.
|
||||
*
|
||||
* A DocMaker reading one line at a time as a Document from a single file. This
|
||||
* saves IO cost (over DirDocMaker) of recursing through a directory and opening
|
||||
* a new file for every document. It also re-uses its Document and Field
|
||||
* instance to improve indexing speed.<br>
|
||||
* The expected format of each line is (arguments are separated by <TAB>):
|
||||
* <i>title, date, body</i>. If a line is read in a different format, a
|
||||
* {@link RuntimeException} will be thrown. In general, you should use this doc
|
||||
* maker with files that were created with {@link WriteLineDocTask}.<br><br>
|
||||
*
|
||||
* Config properties:
|
||||
* docs.file=<path to the file%gt;
|
||||
* doc.reuse.fields=true|false (default true)
|
||||
* doc.random.id.limit=N (default -1) -- create random
|
||||
* docid in the range 0..N; this is useful
|
||||
* with UpdateDoc to test updating random documents; if
|
||||
* this is unspecified or -1, then docid is sequentially
|
||||
* assigned
|
||||
* <ul>
|
||||
* <li>docs.file=<path to the file>
|
||||
* <li>doc.reuse.fields=true|false (default true)
|
||||
* <li>bzip.compression=true|false (default false)
|
||||
* <li>doc.random.id.limit=N (default -1) -- create random docid in the range
|
||||
* 0..N; this is useful with UpdateDoc to test updating random documents; if
|
||||
* this is unspecified or -1, then docid is sequentially assigned
|
||||
* </ul>
|
||||
*/
|
||||
public class LineDocMaker extends BasicDocMaker {
|
||||
|
||||
FileInputStream fileIS;
|
||||
InputStream fileIS;
|
||||
BufferedReader fileIn;
|
||||
ThreadLocal docState = new ThreadLocal();
|
||||
private String fileName;
|
||||
|
@ -57,9 +63,12 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
private final DocState localDocState = new DocState();
|
||||
|
||||
private boolean doReuseFields = true;
|
||||
private boolean bzipCompressionEnabled = false;
|
||||
private Random r;
|
||||
private int numDocs;
|
||||
|
||||
private CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
class DocState {
|
||||
Document doc;
|
||||
Field bodyField;
|
||||
|
@ -93,7 +102,7 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
doc.add(idField);
|
||||
}
|
||||
|
||||
final static String SEP = WriteLineDocTask.SEP;
|
||||
final static char SEP = WriteLineDocTask.SEP;
|
||||
|
||||
private int numDocsCreated;
|
||||
private synchronized int incrNumDocsCreated() {
|
||||
|
@ -101,27 +110,20 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
}
|
||||
|
||||
public Document setFields(String line) {
|
||||
// A line must be in the following format. If it's not, fail !
|
||||
// title <TAB> date <TAB> body <NEWLINE>
|
||||
final String title, date, body;
|
||||
|
||||
int spot = line.indexOf(SEP);
|
||||
if (spot != -1) {
|
||||
title = line.substring(0, spot);
|
||||
int spot2 = line.indexOf(SEP, 1+spot);
|
||||
if (spot2 != -1) {
|
||||
date = line.substring(1+spot, spot2);
|
||||
body = line.substring(1+spot2, line.length());
|
||||
} else
|
||||
date = body = "";
|
||||
} else
|
||||
title = date = body = "";
|
||||
|
||||
final String docID;
|
||||
if (r != null) {
|
||||
docID = "doc" + r.nextInt(numDocs);
|
||||
} else {
|
||||
docID = "doc" + incrNumDocsCreated();
|
||||
if (spot == -1) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||
}
|
||||
int spot2 = line.indexOf(SEP, 1 + spot);
|
||||
if (spot2 == -1) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||
}
|
||||
final String title = line.substring(0, spot);
|
||||
final String date = line.substring(1+spot, spot2);
|
||||
final String body = line.substring(1+spot2, line.length());
|
||||
final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
|
||||
|
||||
if (doReuseFields) {
|
||||
idField.setValue(docID);
|
||||
|
@ -130,7 +132,10 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
bodyField.setValue(body);
|
||||
return doc;
|
||||
} else {
|
||||
Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
|
||||
Field localIDField = new Field(BasicDocMaker.ID_FIELD,
|
||||
docID,
|
||||
Field.Store.YES,
|
||||
Field.Index.NOT_ANALYZED_NO_NORMS);
|
||||
|
||||
Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
|
||||
title,
|
||||
|
@ -174,16 +179,14 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
|
||||
String line;
|
||||
synchronized(this) {
|
||||
while(true) {
|
||||
line = fileIn.readLine();
|
||||
if (line == null) {
|
||||
// Reset the file
|
||||
openFile();
|
||||
if (!forever)
|
||||
throw new NoMoreDataException();
|
||||
} else {
|
||||
break;
|
||||
line = fileIn.readLine();
|
||||
if (line == null) {
|
||||
if (!forever) {
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
// Reset the file
|
||||
openFile();
|
||||
return makeDocument();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -199,15 +202,24 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
|
||||
public synchronized void resetInputs() {
|
||||
super.resetInputs();
|
||||
fileName = config.get("docs.file", null);
|
||||
if (fileName == null)
|
||||
throw new RuntimeException("docs.file must be set");
|
||||
openFile();
|
||||
}
|
||||
|
||||
public void setConfig(Config config) {
|
||||
super.setConfig(config);
|
||||
fileName = config.get("docs.file", null);
|
||||
if (fileName == null) {
|
||||
throw new IllegalArgumentException("docs.file must be set");
|
||||
}
|
||||
doReuseFields = config.get("doc.reuse.fields", true);
|
||||
String doBZCompress = config.get("bzip.compression", null);
|
||||
if (doBZCompress != null) {
|
||||
// Property was set, use the value.
|
||||
bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
|
||||
} else {
|
||||
// Property was not set, attempt to detect based on file's extension
|
||||
bzipCompressionEnabled = fileName.endsWith("bz2");
|
||||
}
|
||||
numDocs = config.get("doc.random.id.limit", -1);
|
||||
if (numDocs != -1) {
|
||||
r = new Random(179);
|
||||
|
@ -216,16 +228,35 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
|
||||
synchronized void openFile() {
|
||||
try {
|
||||
if (fileIn != null)
|
||||
if (fileIn != null) {
|
||||
fileIn.close();
|
||||
}
|
||||
fileIS = new FileInputStream(fileName);
|
||||
fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
|
||||
if (bzipCompressionEnabled) {
|
||||
// According to BZip2CompressorInputStream's code, it reads the first
|
||||
// two file header chars ('B' and 'Z'). We only need to wrap the
|
||||
// underlying stream with a BufferedInputStream, since the code uses
|
||||
// the read() method exclusively.
|
||||
fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
|
||||
fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
|
||||
}
|
||||
// Wrap the stream with a BufferedReader for several reasons:
|
||||
// 1. We need the readLine() method.
|
||||
// 2. Even if bzip.compression is enabled, and is wrapped with
|
||||
// BufferedInputStream, wrapping with a buffer can still improve
|
||||
// performance, since the BIS buffer will be used to read from the
|
||||
// compressed stream, while the BR buffer will be used to read from the
|
||||
// uncompressed stream.
|
||||
fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (CompressorException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public int numUniqueTexts() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,18 +17,39 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
|
||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
|
||||
/**
|
||||
* A task which writes documents, one line per document. Each line is in the
|
||||
* following format: title <TAB> date <TAB> body. The output of this
|
||||
* taske can be consumed by
|
||||
* {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
|
||||
* to save the IO overhead of opening a file per doument to be indexed.<br>
|
||||
*
|
||||
* Supports the following parameters:
|
||||
* <ul>
|
||||
* <li>line.file.out - the name of the file to write the output to. That
|
||||
* parameter is mandatory. <b>NOTE:</b> the file is re-created.
|
||||
* <li>bzip.compression - whether the output should be bzip-compressed. This is
|
||||
* recommended when the output file is expected to be large. (optional, default:
|
||||
* false).
|
||||
* <li>doc.writeline.log.step - controls how many records to process before
|
||||
* logging the status of the task. <b>NOTE:</b> to disable logging, set this
|
||||
* value to 0 or negative. (optional, default:1000).
|
||||
* </ul>
|
||||
*/
|
||||
public class WriteLineDocTask extends PerfTask {
|
||||
|
||||
/**
|
||||
|
@ -36,33 +57,48 @@ public class WriteLineDocTask extends PerfTask {
|
|||
* an "added N docs" message should be logged.
|
||||
*/
|
||||
public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
|
||||
|
||||
public WriteLineDocTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
public final static char SEP = '\t';
|
||||
|
||||
private int logStep = -1;
|
||||
private int docSize = 0;
|
||||
int count = 0;
|
||||
private BufferedWriter lineFileOut=null;
|
||||
private BufferedWriter lineFileOut = null;
|
||||
private DocMaker docMaker;
|
||||
|
||||
public final static String SEP = "\t";
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see PerfTask#setup()
|
||||
*/
|
||||
public void setup() throws Exception {
|
||||
super.setup();
|
||||
if (lineFileOut==null) {
|
||||
Config config = getRunData().getConfig();
|
||||
String fileName = config.get("line.file.out", null);
|
||||
if (fileName == null)
|
||||
throw new Exception("line.file.out must be set");
|
||||
lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
|
||||
public WriteLineDocTask(PerfRunData runData) throws Exception {
|
||||
super(runData);
|
||||
Config config = runData.getConfig();
|
||||
String fileName = config.get("line.file.out", null);
|
||||
if (fileName == null) {
|
||||
throw new IllegalArgumentException("line.file.out must be set");
|
||||
}
|
||||
|
||||
OutputStream out = new FileOutputStream(fileName);
|
||||
boolean doBzipCompression = false;
|
||||
String doBZCompress = config.get("bzip.compression", null);
|
||||
if (doBZCompress != null) {
|
||||
// Property was set, use the value.
|
||||
doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue();
|
||||
} else {
|
||||
// Property was not set, attempt to detect based on file's extension
|
||||
doBzipCompression = fileName.endsWith("bz2");
|
||||
}
|
||||
|
||||
if (doBzipCompression) {
|
||||
// Wrap with BOS since BZip2CompressorOutputStream calls out.write(int)
|
||||
// and does not use the write(byte[]) version. This proved to speed the
|
||||
// compression process by 70% !
|
||||
out = new BufferedOutputStream(out, 1 << 16);
|
||||
out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
|
||||
}
|
||||
lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
|
||||
docMaker = runData.getDocMaker();
|
||||
logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
|
||||
// To avoid the check 'if (logStep > 0)' in log(). This effectively turns
|
||||
// logging off.
|
||||
if (logStep <= 0) {
|
||||
logStep = Integer.MAX_VALUE;
|
||||
}
|
||||
docMaker = getRunData().getDocMaker();
|
||||
}
|
||||
|
||||
public void tearDown() throws Exception {
|
||||
|
@ -71,61 +107,52 @@ public class WriteLineDocTask extends PerfTask {
|
|||
}
|
||||
|
||||
public int doLogic() throws Exception {
|
||||
Document doc;
|
||||
if (docSize > 0) {
|
||||
doc = docMaker.makeDocument(docSize);
|
||||
} else {
|
||||
doc = docMaker.makeDocument();
|
||||
}
|
||||
Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
|
||||
|
||||
Field f = doc.getField(BasicDocMaker.BODY_FIELD);
|
||||
|
||||
String body, title, date;
|
||||
if (f != null)
|
||||
body = f.stringValue().replace('\t', ' ');
|
||||
else
|
||||
body = null;
|
||||
String body = f != null ? f.stringValue().replace('\t', ' ') : null;
|
||||
|
||||
f = doc.getField(BasicDocMaker.TITLE_FIELD);
|
||||
if (f != null)
|
||||
title = f.stringValue().replace('\t', ' ');
|
||||
else
|
||||
title = "";
|
||||
|
||||
f = doc.getField(BasicDocMaker.DATE_FIELD);
|
||||
if (f != null)
|
||||
date = f.stringValue().replace('\t', ' ');
|
||||
else
|
||||
date = "";
|
||||
|
||||
if (body != null) {
|
||||
f = doc.getField(BasicDocMaker.TITLE_FIELD);
|
||||
String title = f != null ? f.stringValue().replace('\t', ' ') : "";
|
||||
|
||||
f = doc.getField(BasicDocMaker.DATE_FIELD);
|
||||
String date = f != null ? f.stringValue().replace('\t', ' ') : "";
|
||||
|
||||
lineFileOut.write(title, 0, title.length());
|
||||
lineFileOut.write(SEP);
|
||||
lineFileOut.write(date, 0, date.length());
|
||||
lineFileOut.write(SEP);
|
||||
lineFileOut.write(body, 0, body.length());
|
||||
lineFileOut.newLine();
|
||||
lineFileOut.flush();
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
private void log (int count) {
|
||||
if (logStep<0) {
|
||||
// init once per instance
|
||||
logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
|
||||
}
|
||||
if (logStep>0 && (count%logStep)==0) {
|
||||
System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
|
||||
private void log(int count) {
|
||||
// logStep is initialized in the ctor to a positive value. If the config
|
||||
// file indicates no logging, or contains an invalid value, logStep is init
|
||||
// to Integer.MAX_VALUE, so that logging will not occur (at least for the
|
||||
// first Integer.MAX_VALUE records).
|
||||
if (count % logStep == 0) {
|
||||
System.out.println("--> " + Thread.currentThread().getName()
|
||||
+ " processed (write line) " + count + " docs");
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws Exception {
|
||||
lineFileOut.close();
|
||||
super.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the params (docSize only)
|
||||
* @param params docSize, or 0 for no limit.
|
||||
*/
|
||||
public void setParams(String params) {
|
||||
super.setParams(params);
|
||||
if (super.supportsParams()) {
|
||||
super.setParams(params);
|
||||
}
|
||||
docSize = (int) Float.parseFloat(params);
|
||||
}
|
||||
|
||||
|
@ -135,4 +162,5 @@ public class WriteLineDocTask extends PerfTask {
|
|||
public boolean supportsParams() {
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.lucene.benchmark;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/** Base class for all Benchmark unit tests. */
|
||||
public class BenchmarkTestCase extends TestCase {
|
||||
|
||||
private static final File workDir;
|
||||
|
||||
static {
|
||||
workDir = new File(System.getProperty("benchmark.work.dir", "test/benchmark")).getAbsoluteFile();
|
||||
workDir.mkdirs();
|
||||
}
|
||||
|
||||
public File getWorkDir() {
|
||||
return workDir;
|
||||
}
|
||||
|
||||
}
|
|
@ -17,188 +17,33 @@
|
|||
|
||||
package org.apache.lucene.benchmark.byTask;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.StringReader;
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Test very simply that perf tasks are parses as expected.
|
||||
*/
|
||||
/** Test very simply that perf tasks are parses as expected. */
|
||||
public class TestPerfTasksParse extends TestCase {
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
static final String NEW_LINE = System.getProperty("line.separator");
|
||||
static final String INDENT = " ";
|
||||
|
||||
// properties in effect in all tests here
|
||||
static final String propPart =
|
||||
INDENT+"directory=RAMDirectory" + NEW_LINE +
|
||||
INDENT+"print.props=false" + NEW_LINE
|
||||
INDENT + "directory=RAMDirectory" + NEW_LINE +
|
||||
INDENT + "print.props=false" + NEW_LINE
|
||||
;
|
||||
|
||||
/*
|
||||
* All known tasks.
|
||||
* As new tasks are added, add them here.
|
||||
* It would be nice to do that automatically, unfortunately
|
||||
* Java does not provide a "get all classes in package" or
|
||||
* "get all sub-classes" functionality.
|
||||
*/
|
||||
static String singleTaskAlgs [];
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see junit.framework.TestCase#setUp()
|
||||
*/
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
if (singleTaskAlgs==null) {
|
||||
singleTaskAlgs = findTasks();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * One-time discovery of the task snippets to test.
 * <p>
 * Starts from a hard-coded list of known tasks. Additionally, if the system
 * property "tasks.dir" points at the directory of compiled task classes, any
 * concrete {@code PerfTask} subclass found there is added too — this somewhat
 * covers tasks added in the future, in case the hard-coded list is not
 * updated. Some tasks may be tested more than once this way, which is fine.
 */
static String [] findTasks () throws Exception {
  ArrayList tsks = new ArrayList();
  // init with tasks we know about
  tsks.add( " AddDoc " );
  tsks.add( " AddDoc(1000.0) " );
  tsks.add( " ClearStats " );
  tsks.add( " CloseIndex " );
  tsks.add( " CloseReader " );
  tsks.add( " CreateIndex " );
  tsks.add( " DeleteDoc " );
  tsks.add( " DeleteDoc(500.0) " );
  tsks.add( " NewRound " );
  tsks.add( " OpenIndex " );
  tsks.add( " OpenReader " );
  tsks.add( " Optimize " );
  tsks.add( " RepAll " );
  tsks.add( " RepSelectByPref prefix " );
  tsks.add( " RepSumByNameRound " );
  tsks.add( " RepSumByName " );
  tsks.add( " RepSumByPrefRound prefix " );
  tsks.add( " RepSumByPref prefix " );
  tsks.add( " ResetInputs " );
  tsks.add( " ResetSystemErase " );
  tsks.add( " ResetSystemSoft " );
  tsks.add( " Search " );
  tsks.add( " SearchTravRet " );
  tsks.add( " SearchTravRet(100.0) " );
  tsks.add( " SearchTrav " );
  tsks.add( " SearchTrav(50.0) " );
  tsks.add( " SetProp " );
  tsks.add( " SetProp(name,value) " );
  tsks.add( " Warm " );
  tsks.add( "SearchTravRetLoadFieldSelector");
  tsks.add("SearchTravRetLoadFieldSelector(body,title)");

  // if tasks.dir property is defined, look for additional tasks.
  // this somewhat covers tasks that would be added in the future, in case
  // the list above is not updated to cover them.
  // some tasks would be tested more than once this way, but that's ok.
  String tasksDir = System.getProperty("tasks.dir");
  if (tasksDir !=null) {
    String pkgPrefix = PerfTask.class.getPackage().getName()+".";
    String taskNames[] = new File(tasksDir).list();
    for (int i = 0; i < taskNames.length; i++) {
      String name = taskNames[i].trim();
      if (!name.endsWith("Task.class"))
        continue; // Task class file only
      name = name.substring(0,name.length()-6); // strip the ".class" file suffix (6 chars)
      Class cls = Class.forName(pkgPrefix+name);
      if (Modifier.isAbstract(cls.getModifiers()) || Modifier.isInterface(cls.getModifiers()))
        continue; // skip abstract classes and interfaces
      if (!PerfTask.class.isAssignableFrom(cls))
        continue; // not a task
      name = name.substring(0,name.length()-4); // strip the "Task" class-name suffix
      // Rep*Pref* tasks take a prefix argument — matches the hard-coded entries above
      if (name.startsWith("Rep") && name.indexOf("Pref")>=0)
        name += " prefix";
      tsks.add(" "+name+" ");
    }
  }
  return (String[]) tsks.toArray(new String[0]);
}
|
||||
|
||||
|
||||
/**
 * Creates a test case with the given name (JUnit 3 style constructor).
 *
 * @param name test name
 */
public TestPerfTasksParse(String name) {
  super(name);
}
|
||||
|
||||
/**
 * Test the parsing of very simple tasks, for all tasks — each task on its
 * own line, with no surrounding sequence.
 */
public void testAllTasksSimpleParse() {
  doTestAllTasksSimpleParse(false,false);
}
|
||||
|
||||
/**
 * Test the parsing of simple sequential sequences, for all tasks.
 * <p>
 * NOTE(review): method name has a typo ("Sequntial") — kept as-is, since
 * JUnit 3 discovers tests by method name.
 */
public void testAllTasksSimpleParseSequntial() {
  doTestAllTasksSimpleParse(true,false);
}
|
||||
|
||||
/**
 * Test the parsing of simple parallel sequences, for all tasks.
 */
public void testAllTasksSimpleParseParallel() {
  doTestAllTasksSimpleParse(true,true);
}
|
||||
|
||||
/**
 * Utility for simple parsing testing of all tasks: parses an algorithm
 * containing each discovered task and asserts the task is found back in the
 * parsed result.
 *
 * @param parOrSeq if true, wrap each task in a task sequence
 * @param par when wrapping, use a parallel sequence ("[ t ] : 2") if true,
 *            otherwise a sequential one ("{ t } : 3")
 */
private void doTestAllTasksSimpleParse(boolean parOrSeq, boolean par) {
  for (int i = 0; i < singleTaskAlgs.length; i++) {
    String testedTask = singleTaskAlgs[i];
    if (parOrSeq) {
      if (par) {
        testedTask = "[ " + testedTask + " ] : 2";
      } else {
        testedTask = "{ " + testedTask + " } : 3";
      }
    }
    try {
      String algText = propPart+INDENT+testedTask;
      logTstParsing(algText);
      Benchmark benchmark = new Benchmark(new StringReader(algText));
      Algorithm alg = benchmark.getAlgorithm();
      ArrayList algTasks = alg.extractTasks();
      // must find a task with this name in the algorithm
      boolean foundName = false;
      boolean foundPar = false;
      // normalize whitespace so the snippet matches the parsed task's toString()
      String theTask = singleTaskAlgs[i].replaceAll(" +"," ").trim();
      for (Iterator iter = algTasks.iterator(); iter.hasNext();) {
        PerfTask task = (PerfTask) iter.next();
        foundName |= (task.toString().indexOf(theTask)>=0);
        foundPar |= (task instanceof TaskSequence && ((TaskSequence)task).isParallel());
      }
      assertTrue("Task "+testedTask+" was not found in "+alg.toString(),foundName);
      if (parOrSeq) {
        // a parallel wrapper must yield a parallel sequence; a sequential one must not
        if (par) {
          assertTrue("Task "+testedTask+" was supposed to be parallel in "+alg.toString(),foundPar);
        } else {
          assertFalse("Task "+testedTask+" was not supposed to be parallel in "+alg.toString(),foundPar);
        }
      }
    } catch (Exception e) {
      System.out.flush();
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
}
|
||||
|
||||
/**
|
||||
* Test the repetition parsing for parallel tasks
|
||||
*/
|
||||
/** Test the repetition parsing for parallel tasks */
|
||||
public void testParseParallelTaskSequenceRepetition() throws Exception {
|
||||
String taskStr = "AddDoc";
|
||||
String parsedTasks = "[ "+taskStr+" ] : 1000";
|
||||
|
@ -219,9 +64,7 @@ public class TestPerfTasksParse extends TestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the repetition parsing for sequential tasks
|
||||
*/
|
||||
/** Test the repetition parsing for sequential tasks */
|
||||
public void testParseTaskSequenceRepetition() throws Exception {
|
||||
String taskStr = "AddDoc";
|
||||
String parsedTasks = "{ "+taskStr+" } : 1000";
|
||||
|
@ -242,11 +85,4 @@ public class TestPerfTasksParse extends TestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private void logTstParsing (String txt) {
|
||||
if (!DEBUG)
|
||||
return;
|
||||
System.out.println("Test parsing of");
|
||||
System.out.println(txt);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
|
||||
/** Tests the functionality of {@link LineDocMaker}. */
|
||||
public class LineDocMakerTest extends BenchmarkTestCase {
|
||||
|
||||
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
private void createBZ2LineFile(File file) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
out = csFactory.createCompressorOutputStream("bzip2", out);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
StringBuffer doc = new StringBuffer();
|
||||
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
|
||||
writer.write(doc.toString());
|
||||
writer.newLine();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void createRegularLineFile(File file) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
StringBuffer doc = new StringBuffer();
|
||||
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
|
||||
writer.write(doc.toString());
|
||||
writer.newLine();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void doIndexAndSearchTest(File file, boolean setBZCompress,
|
||||
String bz2CompressVal) throws Exception {
|
||||
|
||||
Properties props = new Properties();
|
||||
|
||||
// LineDocMaker specific settings.
|
||||
props.setProperty("docs.file", file.getAbsolutePath());
|
||||
if (setBZCompress) {
|
||||
props.setProperty("bzip.compression", bz2CompressVal);
|
||||
}
|
||||
|
||||
// Indexing configuration.
|
||||
props.setProperty("analyzer", SimpleAnalyzer.class.getName());
|
||||
props.setProperty("doc.maker", LineDocMaker.class.getName());
|
||||
props.setProperty("directory", "RAMDirectory");
|
||||
|
||||
// Create PerfRunData
|
||||
Config config = new Config(props);
|
||||
PerfRunData runData = new PerfRunData(config);
|
||||
|
||||
TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
|
||||
tasks.addTask(new CreateIndexTask(runData));
|
||||
tasks.addTask(new AddDocTask(runData));
|
||||
tasks.addTask(new CloseIndexTask(runData));
|
||||
tasks.doLogic();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(runData.getDirectory());
|
||||
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
|
||||
assertEquals(1, td.totalHits);
|
||||
assertNotNull(td.scoreDocs[0]);
|
||||
searcher.close();
|
||||
}
|
||||
|
||||
/* Tests LineDocMaker with a bzip2 input stream. */
|
||||
public void testBZip2() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file);
|
||||
doIndexAndSearchTest(file, true, "true");
|
||||
}
|
||||
|
||||
public void testBZip2AutoDetect() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file);
|
||||
doIndexAndSearchTest(file, false, null);
|
||||
}
|
||||
|
||||
public void testBZip2WithBzipCompressionDisabled() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file);
|
||||
|
||||
try {
|
||||
doIndexAndSearchTest(file, true, "false");
|
||||
fail("Some exception should have been thrown !");
|
||||
} catch (Exception e) {
|
||||
// expected.
|
||||
}
|
||||
}
|
||||
|
||||
public void testRegularFile() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
createRegularLineFile(file);
|
||||
doIndexAndSearchTest(file, false, null);
|
||||
}
|
||||
|
||||
public void testRegularFileWithBZipCompressionEnabled() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
createRegularLineFile(file);
|
||||
|
||||
try {
|
||||
doIndexAndSearchTest(file, true, "true");
|
||||
fail("Some exception should have been thrown !");
|
||||
} catch (Exception e) {
|
||||
// expected.
|
||||
}
|
||||
}
|
||||
|
||||
public void testInvalidFormat() throws Exception {
|
||||
String[] testCases = new String[] {
|
||||
"", // empty line
|
||||
"title", // just title
|
||||
"title" + WriteLineDocTask.SEP, // title + SEP
|
||||
"title" + WriteLineDocTask.SEP + "body", // title + SEP + body
|
||||
// note that title + SEP + body + SEP is a valid line, which results in an
|
||||
// empty body
|
||||
};
|
||||
|
||||
for (int i = 0; i < testCases.length; i++) {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
|
||||
writer.write(testCases[i]);
|
||||
writer.newLine();
|
||||
writer.close();
|
||||
try {
|
||||
doIndexAndSearchTest(file, false, null);
|
||||
fail("Some exception should have been thrown for: [" + testCases[i] + "]");
|
||||
} catch (Exception e) {
|
||||
// expected.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.Index;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
|
||||
/** Tests the functionality of {@link WriteLineDocTask}. */
|
||||
public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||
|
||||
// class has to be public so that Class.forName.newInstance() will work
|
||||
public static final class WriteLineDocMaker extends BasicDocMaker {
|
||||
|
||||
protected DocData getNextDocData() throws NoMoreDataException, Exception {
|
||||
throw new UnsupportedOperationException("not implemented");
|
||||
}
|
||||
|
||||
public Document makeDocument() throws Exception {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||
doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||
doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||
return doc;
|
||||
}
|
||||
|
||||
public int numUniqueTexts() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
|
||||
Properties props = new Properties();
|
||||
props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
|
||||
props.setProperty("line.file.out", file.getAbsolutePath());
|
||||
if (setBZCompress) {
|
||||
props.setProperty("bzip.compression", bz2CompressVal);
|
||||
}
|
||||
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
|
||||
Config config = new Config(props);
|
||||
return new PerfRunData(config);
|
||||
}
|
||||
|
||||
private void doReadTest(File file, boolean bz2File) throws Exception {
|
||||
InputStream in = new FileInputStream(file);
|
||||
if (bz2File) {
|
||||
in = csFactory.createCompressorInputStream("bzip2", in);
|
||||
}
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertNotNull(line);
|
||||
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||
assertEquals(3, parts.length);
|
||||
assertEquals("title", parts[0]);
|
||||
assertEquals("date", parts[1]);
|
||||
assertEquals("body", parts[2]);
|
||||
assertNull(br.readLine());
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
/* Tests WriteLineDocTask with a bzip2 format. */
|
||||
public void testBZip2() throws Exception {
|
||||
|
||||
// Create a document in bz2 format.
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
PerfRunData runData = createPerfRunData(file, true, "true");
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
doReadTest(file, true);
|
||||
}
|
||||
|
||||
public void testBZip2AutoDetect() throws Exception {
|
||||
|
||||
// Create a document in bz2 format.
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
PerfRunData runData = createPerfRunData(file, false, null);
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
doReadTest(file, true);
|
||||
}
|
||||
|
||||
public void testRegularFile() throws Exception {
|
||||
|
||||
// Create a document in regular format.
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, true, "false");
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
doReadTest(file, false);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue