LUCENE-2958: WriteLineDocTask improvements - flexible line fields definition - port/merge from 3x.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1083816 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doron Cohen 2011-03-21 14:59:42 +00:00
parent 99c13a4e29
commit a9fda446c3
6 changed files with 467 additions and 66 deletions

View File

@ -2,6 +2,13 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
03/21/2011
LUCENE-2958: WriteLineDocTask improvements - allow to emit line docs also for empty
docs, and be flexible about which fields are added to the line file. For this, a header
line was added to the line file. That header is examined by LineDocSource. Old line
files which have no header line are handled as before, imposing the default header.
(Doron Cohen, Shai Erera, Mike McCandless)
03/21/2011
LUCENE-2964: Allow benchmark tasks from alternative packages,
specified through a new property "alt.tasks.packages".

View File

@ -22,6 +22,9 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
@ -40,16 +43,136 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
* <ul>
* <li>docs.file=&lt;path to the file&gt;
* <li>content.source.encoding - default to UTF-8.
* <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
* from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
* </ul>
*/
public class LineDocSource extends ContentSource {
private final static char SEP = WriteLineDocTask.SEP;
/** Reader of a single input line into {@link DocData}. */
public static abstract class LineParser {
protected final String[] header;
/** Construct with the header
* @param header header line found in the input file, or null if none
*/
public LineParser(String[] header) {
this.header = header;
}
/** parse an input line and fill doc data appropriately */
public abstract void parseLine(DocData docData, String line);
}
/**
* {@link LineParser} which ignores the header passed to its constructor
* and assumes simply that field names and their order are the same
* as in {@link WriteLineDocTask#DEFAULT_FIELDS}
*/
public static class SimpleLineParser extends LineParser {
public SimpleLineParser(String[] header) {
super(header);
}
public void parseLine(DocData docData, String line) {
int k1 = 0;
int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
if (k2<0) {
throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
}
docData.setTitle(line.substring(k1,k2));
k1 = k2+1;
k2 = line.indexOf(WriteLineDocTask.SEP, k1);
if (k2<0) {
throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
}
docData.setDate(line.substring(k1,k2));
k1 = k2+1;
k2 = line.indexOf(WriteLineDocTask.SEP, k1);
if (k2>=0) {
throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
}
// last one
docData.setBody(line.substring(k1));
}
}
/**
* {@link LineParser} which sets field names and order by
* the header - any header - of the lines file.
* It is less efficient than {@link SimpleLineParser} but more powerful.
*/
public static class HeaderLineParser extends LineParser {
private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
private final FieldName[] posToF;
public HeaderLineParser(String[] header) {
super(header);
posToF = new FieldName[header.length];
for (int i=0; i<header.length; i++) {
String f = header[i];
if (DocMaker.NAME_FIELD.equals(f)) {
posToF[i] = FieldName.NAME;
} else if (DocMaker.TITLE_FIELD.equals(f)) {
posToF[i] = FieldName.TITLE;
} else if (DocMaker.DATE_FIELD.equals(f)) {
posToF[i] = FieldName.DATE;
} else if (DocMaker.BODY_FIELD.equals(f)) {
posToF[i] = FieldName.BODY;
} else {
posToF[i] = FieldName.PROP;
}
}
}
public void parseLine(DocData docData, String line) {
int n = 0;
int k1 = 0;
int k2;
while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
if (n>=header.length) {
throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
}
setDocDataField(docData, n, line.substring(k1,k2));
++n;
k1 = k2 + 1;
}
if (n!=header.length-1) {
throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
}
// last one
setDocDataField(docData, n, line.substring(k1));
}
private void setDocDataField(DocData docData, int position, String text) {
switch(posToF[position]) {
case NAME:
docData.setName(text);
break;
case TITLE:
docData.setTitle(text);
break;
case DATE:
docData.setDate(text);
break;
case BODY:
docData.setBody(text);
break;
case PROP:
Properties p = docData.getProps();
if (p==null) {
p = new Properties();
docData.setProps(p);
}
p.setProperty(header[position], text);
break;
}
}
}
private File file;
private BufferedReader reader;
private int readCount;
private LineParser docDataLineReader = null;
private boolean skipHeaderLine = false;
private synchronized void openFile() {
try {
if (reader != null) {
@ -57,6 +180,9 @@ public class LineDocSource extends ContentSource {
}
InputStream is = getInputStream(file);
reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
if (skipHeaderLine) {
reader.readLine(); // skip one line - the header line - already handled that info
}
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -77,7 +203,6 @@ public class LineDocSource extends ContentSource {
synchronized(this) {
line = reader.readLine();
myID = readCount++;
if (line == null) {
if (!forever) {
throw new NoMoreDataException();
@ -86,27 +211,54 @@ public class LineDocSource extends ContentSource {
openFile();
return getNextDocData(docData);
}
if (docDataLineReader == null) { // first line ever, one time initialization,
docDataLineReader = createDocDataLineReader(line);
if (skipHeaderLine) {
return getNextDocData(docData);
}
}
// increment IDS only once...
myID = readCount++;
}
// A line must be in the following format. If it's not, fail !
// title <TAB> date <TAB> body <NEWLINE>
int spot = line.indexOf(SEP);
if (spot == -1) {
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
}
int spot2 = line.indexOf(SEP, 1 + spot);
if (spot2 == -1) {
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
}
// The date String was written in the format of DateTools.dateToString.
docData.clear();
docData.setID(myID);
docData.setBody(line.substring(1 + spot2, line.length()));
docData.setTitle(line.substring(0, spot));
docData.setDate(line.substring(1 + spot, spot2));
docDataLineReader.parseLine(docData, line);
return docData;
}
private LineParser createDocDataLineReader(String line) {
String[] header;
String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
if (line.startsWith(headIndicator)) {
header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
skipHeaderLine = true; // mark to skip the header line when input file is reopened
} else {
header = WriteLineDocTask.DEFAULT_FIELDS;
}
// if a specific DocDataLineReader was configured, must respect it
String docDataLineReaderClassName = getConfig().get("line.parser", null);
if (docDataLineReaderClassName!=null) {
try {
final Class<? extends LineParser> clazz =
Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
return cnstr.newInstance((Object)header);
} catch (Exception e) {
throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
}
}
// if this the simple case,
if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
return new SimpleLineParser(header);
}
return new HeaderLineParser(header);
}
@Override
public void resetInputs() throws IOException {
super.resetInputs();

View File

@ -23,6 +23,8 @@ import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -41,11 +43,17 @@ import org.apache.lucene.document.Field;
* to save the IO overhead of opening a file per document to be indexed.<br>
* Supports the following parameters:
* <ul>
* <li>line.file.out - the name of the file to write the output to. That
* <li><b>line.file.out<b> - the name of the file to write the output to. That
* parameter is mandatory. <b>NOTE:</b> the file is re-created.
* <li>bzip.compression - whether the output should be bzip-compressed. This is
* recommended when the output file is expected to be large. (optional, default:
* false).
* <li><b>bzip.compression<b> - whether the output should be bzip-compressed. This is
* recommended when the output file is expected to be large.
* <li><b>line.fields<b> - which fields should be written in each line.
* (optional, default: {@link #DEFAULT_FIELDS}).
* <li><b>sufficient.fields</b> - list of field names, separated by comma, which,
* if all of them are missing, the document will be skipped. For example, to require
* that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
* that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>.
* (optional, default: {@link #DEFAULT_SUFFICIENT_FIELDS}).
* </ul>
* <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
* output is unspecified (as all will write to the same output file in a
@ -53,13 +61,32 @@ import org.apache.lucene.document.Field;
*/
public class WriteLineDocTask extends PerfTask {
public static final String FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";
public final static char SEP = '\t';
/**
* Fields to be written by default
*/
public static final String[] DEFAULT_FIELDS = new String[] {
DocMaker.TITLE_FIELD,
DocMaker.DATE_FIELD,
DocMaker.BODY_FIELD,
};
/**
* Default fields which at least one of them is required to not skip the doc.
*/
public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
private int docSize = 0;
private PrintWriter lineFileOut = null;
private DocMaker docMaker;
private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
private final String[] fieldsToWrite;;
private final boolean[] sufficientFields;
private final boolean checkSufficientFields;
public WriteLineDocTask(PerfRunData runData) throws Exception {
super(runData);
@ -89,6 +116,51 @@ public class WriteLineDocTask extends PerfTask {
}
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16));
docMaker = runData.getDocMaker();
// init fields
String f2r = config.get("line.fields",null);
if (f2r == null) {
fieldsToWrite = DEFAULT_FIELDS;
} else {
if (f2r.indexOf(SEP)>=0) {
throw new IllegalArgumentException("line.fields "+f2r+" should not contain the separator char: "+SEP);
}
fieldsToWrite = f2r.split(",");
}
// init sufficient fields
sufficientFields = new boolean[fieldsToWrite.length];
String suff = config.get("sufficient.fields",DEFAULT_SUFFICIENT_FIELDS);
if (",".equals(suff)) {
checkSufficientFields = false;
} else {
checkSufficientFields = true;
HashSet<String> sf = new HashSet<String>(Arrays.asList(suff.split(",")));
for (int i=0; i<fieldsToWrite.length; i++) {
if (sf.contains(fieldsToWrite[i])) {
sufficientFields[i] = true;
}
}
}
writeHeader();
}
/**
* Write a header to the lines file - indicating how to read the file later
*/
private void writeHeader() {
StringBuilder sb = threadBuffer.get();
if (sb == null) {
sb = new StringBuilder();
threadBuffer.set(sb);
}
sb.setLength(0);
sb.append(FIELDS_HEADER_INDICATOR);
for (String f : fieldsToWrite) {
sb.append(SEP).append(f);
}
lineFileOut.println(sb.toString());
}
@Override
@ -106,27 +178,26 @@ public class WriteLineDocTask extends PerfTask {
threadNormalizer.set(matcher);
}
Field f = doc.getField(DocMaker.BODY_FIELD);
String body = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
f = doc.getField(DocMaker.TITLE_FIELD);
String title = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
if (body.length() > 0 || title.length() > 0) {
f = doc.getField(DocMaker.DATE_FIELD);
String date = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
StringBuilder sb = threadBuffer.get();
if (sb == null) {
sb = new StringBuilder();
threadBuffer.set(sb);
}
sb.setLength(0);
sb.append(title).append(SEP).append(date).append(SEP).append(body);
StringBuilder sb = threadBuffer.get();
if (sb == null) {
sb = new StringBuilder();
threadBuffer.set(sb);
}
sb.setLength(0);
boolean sufficient = !checkSufficientFields;
for (int i=0; i<fieldsToWrite.length; i++) {
Field f = doc.getField(fieldsToWrite[i]);
String text = f == null ? "" : matcher.reset(f.stringValue()).replaceAll(" ").trim();
sb.append(text).append(SEP);
sufficient |= text.length()>0 && sufficientFields[i];
}
if (sufficient) {
sb.setLength(sb.length()-1); // remove redundant last separator
// lineFileOut is a PrintWriter, which synchronizes internally in println.
lineFileOut.println(sb.toString());
}
return 1;
}

View File

@ -36,6 +36,7 @@ import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldsEnum;
@ -393,8 +394,13 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
BufferedReader r = new BufferedReader(new FileReader(lineFile));
int numLines = 0;
while(r.readLine() != null)
String line;
while((line = r.readLine()) != null) {
if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
continue; // do not count the header line as a doc
}
numLines++;
}
r.close();
assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);

View File

@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Properties;
@ -28,6 +29,8 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
@ -44,29 +47,71 @@ public class LineDocSourceTest extends BenchmarkTestCase {
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
private void createBZ2LineFile(File file) throws Exception {
private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
OutputStream out = new FileOutputStream(file);
out = csFactory.createCompressorOutputStream("bzip2", out);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
StringBuilder doc = new StringBuilder();
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
writer.write(doc.toString());
writer.newLine();
writeDocsToFile(writer, addHeader, null);
writer.close();
}
private void createRegularLineFile(File file) throws Exception {
OutputStream out = new FileOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields) throws IOException {
if (addHeader) {
writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
writer.write(WriteLineDocTask.SEP);
writer.write(DocMaker.TITLE_FIELD);
writer.write(WriteLineDocTask.SEP);
writer.write(DocMaker.DATE_FIELD);
writer.write(WriteLineDocTask.SEP);
writer.write(DocMaker.BODY_FIELD);
if (otherFields!=null) {
// additional field names in the header
for (Object fn : otherFields.keySet()) {
writer.write(WriteLineDocTask.SEP);
writer.write(fn.toString());
}
}
writer.newLine();
}
StringBuilder doc = new StringBuilder();
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
if (otherFields!=null) {
// additional field values in the doc line
for (Object fv : otherFields.values()) {
doc.append(WriteLineDocTask.SEP).append(fv.toString());
}
}
writer.write(doc.toString());
writer.newLine();
}
private void createRegularLineFile(File file, boolean addHeader) throws Exception {
OutputStream out = new FileOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
writeDocsToFile(writer, addHeader, null);
writer.close();
}
private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws Exception {
OutputStream out = new FileOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
Properties p = new Properties();
for (String f : extraFields) {
p.setProperty(f, f);
}
writeDocsToFile(writer, true, p);
writer.close();
}
private void doIndexAndSearchTest(File file, boolean setBZCompress,
String bz2CompressVal) throws Exception {
String bz2CompressVal, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 1, storedField); // no extra repetitions
doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 2, storedField); // 1 extra repetition
doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 4, storedField); // 3 extra repetitions
}
private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress,
String bz2CompressVal, Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {
Properties props = new Properties();
@ -75,11 +120,16 @@ public class LineDocSourceTest extends BenchmarkTestCase {
if (setBZCompress) {
props.setProperty("bzip.compression", bz2CompressVal);
}
if (lineParserClass != null) {
props.setProperty("line.parser", lineParserClass.getName());
}
// Indexing configuration.
props.setProperty("analyzer", MockAnalyzer.class.getName());
props.setProperty("content.source", LineDocSource.class.getName());
props.setProperty("directory", "RAMDirectory");
props.setProperty("doc.stored", "true");
props.setProperty("doc.index.props", "true");
// Create PerfRunData
Config config = new Config(props);
@ -87,34 +137,60 @@ public class LineDocSourceTest extends BenchmarkTestCase {
TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
tasks.addTask(new CreateIndexTask(runData));
tasks.addTask(new AddDocTask(runData));
for (int i=0; i<numAdds; i++) {
tasks.addTask(new AddDocTask(runData));
}
tasks.addTask(new CloseIndexTask(runData));
tasks.doLogic();
IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
assertEquals(1, td.totalHits);
assertEquals(numAdds, td.totalHits);
assertNotNull(td.scoreDocs[0]);
if (storedField==null) {
storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
}
assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField));
searcher.close();
}
/* Tests LineDocSource with a bzip2 input stream. */
public void testBZip2() throws Exception {
File file = new File(getWorkDir(), "one-line.bz2");
createBZ2LineFile(file);
doIndexAndSearchTest(file, true, "true");
createBZ2LineFile(file,true);
doIndexAndSearchTest(file, true, "true", null, null);
}
public void testBZip2NoHeaderLine() throws Exception {
File file = new File(getWorkDir(), "one-line.bz2");
createBZ2LineFile(file,false);
doIndexAndSearchTest(file, true, "true", null, null);
}
public void testBZip2AutoDetect() throws Exception {
File file = new File(getWorkDir(), "one-line.bz2");
createBZ2LineFile(file);
doIndexAndSearchTest(file, false, null);
createBZ2LineFile(file,false);
doIndexAndSearchTest(file, false, null, null, null);
}
public void testRegularFile() throws Exception {
File file = new File(getWorkDir(), "one-line");
createRegularLineFile(file);
doIndexAndSearchTest(file, false, null);
createRegularLineFile(file,true);
doIndexAndSearchTest(file, false, null, null, null);
}
public void testRegularFileSpecialHeader() throws Exception {
File file = new File(getWorkDir(), "one-line");
createRegularLineFile(file,true);
doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null);
}
public void testRegularFileNoHeaderLine() throws Exception {
File file = new File(getWorkDir(), "one-line");
createRegularLineFile(file,false);
doIndexAndSearchTest(file, false, null, null, null);
}
public void testInvalidFormat() throws Exception {
@ -134,7 +210,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
writer.newLine();
writer.close();
try {
doIndexAndSearchTest(file, false, null);
doIndexAndSearchTest(file, false, null, null, null);
fail("Some exception should have been thrown for: [" + testCases[i] + "]");
} catch (Exception e) {
// expected.
@ -142,4 +218,19 @@ public class LineDocSourceTest extends BenchmarkTestCase {
}
}
/** Doc Name is not part of the default header */
public void testWithDocsName() throws Exception {
File file = new File(getWorkDir(), "one-line");
createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD);
}
/** Use fields names that are not defined in Docmaker and so will go to Properties */
public void testWithProperties() throws Exception {
File file = new File(getWorkDir(), "one-line");
String specialField = "mySpecialField";
createRegularLineFileWithMoreFields(file, specialField);
doIndexAndSearchTest(file, false, null, null, specialField);
}
}

View File

@ -98,6 +98,25 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
return doc;
}
}
// class has to be public so that Class.forName.newInstance() will work
// same as JustDate just that this one is treated as legal
public static final class LegalJustDateDocMaker extends DocMaker {
@Override
public Document makeDocument() throws Exception {
Document doc = new Document();
doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
return doc;
}
}
// class has to be public so that Class.forName.newInstance() will work
public static final class EmptyDocMaker extends DocMaker {
@Override
public Document makeDocument() throws Exception {
return new Document();
}
}
// class has to be public so that Class.forName.newInstance() will work
public static final class ThreadingDocMaker extends DocMaker {
@ -117,6 +136,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
private PerfRunData createPerfRunData(File file, boolean setBZCompress,
boolean allowEmptyDocs,
String bz2CompressVal,
String docMakerName) throws Exception {
Properties props = new Properties();
@ -126,6 +146,13 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
props.setProperty("bzip.compression", bz2CompressVal);
}
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
if (allowEmptyDocs) {
props.setProperty("sufficient.fields", ",");
}
if (docMakerName.equals(LegalJustDateDocMaker.class.getName())) {
props.setProperty("line.fields", DocMaker.DATE_FIELD);
props.setProperty("sufficient.fields", DocMaker.DATE_FIELD);
}
Config config = new Config(props);
return new PerfRunData(config);
}
@ -139,6 +166,8 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
try {
String line = br.readLine();
assertHeaderLine(line);
line = br.readLine();
assertNotNull(line);
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
int numExpParts = expBody == null ? 2 : 3;
@ -153,13 +182,17 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
br.close();
}
}
private void assertHeaderLine(String line) {
assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
}
/* Tests WriteLineDocTask with a bzip2 format. */
public void testBZip2() throws Exception {
// Create a document in bz2 format.
File file = new File(getWorkDir(), "one-line.bz2");
PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -171,7 +204,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
// Create a document in bz2 format.
File file = new File(getWorkDir(), "one-line.bz2");
PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -183,7 +216,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
// Create a document in regular format.
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -196,7 +229,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
// separator char. However, it didn't replace newline characters, which
// resulted in errors in LineDocSource.
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -209,7 +242,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
// had a TITLE element (LUCENE-1755). It should throw away documents if they
// don't have BODY nor TITLE
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, null, NoBodyDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -219,7 +252,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
public void testEmptyTitle() throws Exception {
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, null, NoTitleDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -227,9 +260,10 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
doReadTest(file, false, "", "date", "body");
}
/** Fail by default when there's only date */
public void testJustDate() throws Exception {
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, null, JustDateDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@ -237,15 +271,53 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
try {
String line = br.readLine();
assertHeaderLine(line);
line = br.readLine();
assertNull(line);
} finally {
br.close();
}
}
public void testLegalJustDate() throws Exception {
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
try {
String line = br.readLine();
assertHeaderLine(line);
line = br.readLine();
assertNotNull(line);
} finally {
br.close();
}
}
public void testEmptyDoc() throws Exception {
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
try {
String line = br.readLine();
assertHeaderLine(line);
line = br.readLine();
assertNotNull(line);
} finally {
br.close();
}
}
public void testMultiThreaded() throws Exception {
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, null, ThreadingDocMaker.class.getName());
PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName());
final WriteLineDocTask wldt = new WriteLineDocTask(runData);
Thread[] threads = new Thread[10];
for (int i = 0; i < threads.length; i++) {
@ -269,8 +341,10 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
Set<String> ids = new HashSet<String>();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
try {
String line = br.readLine();
assertHeaderLine(line); // header line is written once, no matter how many threads there are
for (int i = 0; i < threads.length; i++) {
String line = br.readLine();
line = br.readLine();
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
assertEquals(3, parts.length);
// check that all thread names written are the same in the same line