mirror of https://github.com/apache/lucene.git
LUCENE-2958: WriteLineDocTask improvements - flexible line fields definition - port/merge from 3x.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1083816 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
99c13a4e29
commit
a9fda446c3
|
@ -2,6 +2,13 @@ Lucene Benchmark Contrib Change Log
|
|||
|
||||
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
|
||||
|
||||
03/21/2011
|
||||
LUCENE-2958: WriteLineDocTask improvements - allow to emit line docs also for empty
|
||||
docs, and be flexible about which fields are added to the line file. For this, a header
|
||||
line was added to the line file. That header is examined by LineDocSource. Old line
|
||||
files which have no header line are handled as before, imposing the default header.
|
||||
(Doron Cohen, Shai Erera, Mike McCandless)
|
||||
|
||||
03/21/2011
|
||||
LUCENE-2964: Allow benchmark tasks from alternative packages,
|
||||
specified through a new property "alt.tasks.packages".
|
||||
|
|
|
@ -22,6 +22,9 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.Arrays;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
|
@ -40,16 +43,136 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
|
|||
* <ul>
|
||||
* <li>docs.file=<path to the file>
|
||||
* <li>content.source.encoding - default to UTF-8.
|
||||
* <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
|
||||
* from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
|
||||
* </ul>
|
||||
*/
|
||||
public class LineDocSource extends ContentSource {
|
||||
|
||||
private final static char SEP = WriteLineDocTask.SEP;
|
||||
/** Reader of a single input line into {@link DocData}. */
|
||||
public static abstract class LineParser {
|
||||
protected final String[] header;
|
||||
/** Construct with the header
|
||||
* @param header header line found in the input file, or null if none
|
||||
*/
|
||||
public LineParser(String[] header) {
|
||||
this.header = header;
|
||||
}
|
||||
/** parse an input line and fill doc data appropriately */
|
||||
public abstract void parseLine(DocData docData, String line);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link LineParser} which ignores the header passed to its constructor
|
||||
* and assumes simply that field names and their order are the same
|
||||
* as in {@link WriteLineDocTask#DEFAULT_FIELDS}
|
||||
*/
|
||||
public static class SimpleLineParser extends LineParser {
|
||||
public SimpleLineParser(String[] header) {
|
||||
super(header);
|
||||
}
|
||||
public void parseLine(DocData docData, String line) {
|
||||
int k1 = 0;
|
||||
int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
|
||||
if (k2<0) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
|
||||
}
|
||||
docData.setTitle(line.substring(k1,k2));
|
||||
k1 = k2+1;
|
||||
k2 = line.indexOf(WriteLineDocTask.SEP, k1);
|
||||
if (k2<0) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
|
||||
}
|
||||
docData.setDate(line.substring(k1,k2));
|
||||
k1 = k2+1;
|
||||
k2 = line.indexOf(WriteLineDocTask.SEP, k1);
|
||||
if (k2>=0) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
|
||||
}
|
||||
// last one
|
||||
docData.setBody(line.substring(k1));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link LineParser} which sets field names and order by
|
||||
* the header - any header - of the lines file.
|
||||
* It is less efficient than {@link SimpleLineParser} but more powerful.
|
||||
*/
|
||||
public static class HeaderLineParser extends LineParser {
|
||||
private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
|
||||
private final FieldName[] posToF;
|
||||
public HeaderLineParser(String[] header) {
|
||||
super(header);
|
||||
posToF = new FieldName[header.length];
|
||||
for (int i=0; i<header.length; i++) {
|
||||
String f = header[i];
|
||||
if (DocMaker.NAME_FIELD.equals(f)) {
|
||||
posToF[i] = FieldName.NAME;
|
||||
} else if (DocMaker.TITLE_FIELD.equals(f)) {
|
||||
posToF[i] = FieldName.TITLE;
|
||||
} else if (DocMaker.DATE_FIELD.equals(f)) {
|
||||
posToF[i] = FieldName.DATE;
|
||||
} else if (DocMaker.BODY_FIELD.equals(f)) {
|
||||
posToF[i] = FieldName.BODY;
|
||||
} else {
|
||||
posToF[i] = FieldName.PROP;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void parseLine(DocData docData, String line) {
|
||||
int n = 0;
|
||||
int k1 = 0;
|
||||
int k2;
|
||||
while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
|
||||
if (n>=header.length) {
|
||||
throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
|
||||
}
|
||||
setDocDataField(docData, n, line.substring(k1,k2));
|
||||
++n;
|
||||
k1 = k2 + 1;
|
||||
}
|
||||
if (n!=header.length-1) {
|
||||
throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
|
||||
}
|
||||
// last one
|
||||
setDocDataField(docData, n, line.substring(k1));
|
||||
}
|
||||
|
||||
private void setDocDataField(DocData docData, int position, String text) {
|
||||
switch(posToF[position]) {
|
||||
case NAME:
|
||||
docData.setName(text);
|
||||
break;
|
||||
case TITLE:
|
||||
docData.setTitle(text);
|
||||
break;
|
||||
case DATE:
|
||||
docData.setDate(text);
|
||||
break;
|
||||
case BODY:
|
||||
docData.setBody(text);
|
||||
break;
|
||||
case PROP:
|
||||
Properties p = docData.getProps();
|
||||
if (p==null) {
|
||||
p = new Properties();
|
||||
docData.setProps(p);
|
||||
}
|
||||
p.setProperty(header[position], text);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private File file;
|
||||
private BufferedReader reader;
|
||||
private int readCount;
|
||||
|
||||
private LineParser docDataLineReader = null;
|
||||
private boolean skipHeaderLine = false;
|
||||
|
||||
private synchronized void openFile() {
|
||||
try {
|
||||
if (reader != null) {
|
||||
|
@ -57,6 +180,9 @@ public class LineDocSource extends ContentSource {
|
|||
}
|
||||
InputStream is = getInputStream(file);
|
||||
reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
|
||||
if (skipHeaderLine) {
|
||||
reader.readLine(); // skip one line - the header line - already handled that info
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
@ -77,7 +203,6 @@ public class LineDocSource extends ContentSource {
|
|||
|
||||
synchronized(this) {
|
||||
line = reader.readLine();
|
||||
myID = readCount++;
|
||||
if (line == null) {
|
||||
if (!forever) {
|
||||
throw new NoMoreDataException();
|
||||
|
@ -86,27 +211,54 @@ public class LineDocSource extends ContentSource {
|
|||
openFile();
|
||||
return getNextDocData(docData);
|
||||
}
|
||||
if (docDataLineReader == null) { // first line ever, one time initialization,
|
||||
docDataLineReader = createDocDataLineReader(line);
|
||||
if (skipHeaderLine) {
|
||||
return getNextDocData(docData);
|
||||
}
|
||||
}
|
||||
// increment IDS only once...
|
||||
myID = readCount++;
|
||||
}
|
||||
|
||||
// A line must be in the following format. If it's not, fail !
|
||||
// title <TAB> date <TAB> body <NEWLINE>
|
||||
int spot = line.indexOf(SEP);
|
||||
if (spot == -1) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||
}
|
||||
int spot2 = line.indexOf(SEP, 1 + spot);
|
||||
if (spot2 == -1) {
|
||||
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||
}
|
||||
// The date String was written in the format of DateTools.dateToString.
|
||||
docData.clear();
|
||||
docData.setID(myID);
|
||||
docData.setBody(line.substring(1 + spot2, line.length()));
|
||||
docData.setTitle(line.substring(0, spot));
|
||||
docData.setDate(line.substring(1 + spot, spot2));
|
||||
docDataLineReader.parseLine(docData, line);
|
||||
return docData;
|
||||
}
|
||||
|
||||
private LineParser createDocDataLineReader(String line) {
|
||||
String[] header;
|
||||
String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
|
||||
|
||||
if (line.startsWith(headIndicator)) {
|
||||
header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
|
||||
skipHeaderLine = true; // mark to skip the header line when input file is reopened
|
||||
} else {
|
||||
header = WriteLineDocTask.DEFAULT_FIELDS;
|
||||
}
|
||||
|
||||
// if a specific DocDataLineReader was configured, must respect it
|
||||
String docDataLineReaderClassName = getConfig().get("line.parser", null);
|
||||
if (docDataLineReaderClassName!=null) {
|
||||
try {
|
||||
final Class<? extends LineParser> clazz =
|
||||
Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
|
||||
Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
|
||||
return cnstr.newInstance((Object)header);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
|
||||
}
|
||||
}
|
||||
|
||||
// if this the simple case,
|
||||
if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
|
||||
return new SimpleLineParser(header);
|
||||
}
|
||||
return new HeaderLineParser(header);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetInputs() throws IOException {
|
||||
super.resetInputs();
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.io.FileOutputStream;
|
|||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -41,11 +43,17 @@ import org.apache.lucene.document.Field;
|
|||
* to save the IO overhead of opening a file per document to be indexed.<br>
|
||||
* Supports the following parameters:
|
||||
* <ul>
|
||||
* <li>line.file.out - the name of the file to write the output to. That
|
||||
* <li><b>line.file.out<b> - the name of the file to write the output to. That
|
||||
* parameter is mandatory. <b>NOTE:</b> the file is re-created.
|
||||
* <li>bzip.compression - whether the output should be bzip-compressed. This is
|
||||
* recommended when the output file is expected to be large. (optional, default:
|
||||
* false).
|
||||
* <li><b>bzip.compression<b> - whether the output should be bzip-compressed. This is
|
||||
* recommended when the output file is expected to be large.
|
||||
* <li><b>line.fields<b> - which fields should be written in each line.
|
||||
* (optional, default: {@link #DEFAULT_FIELDS}).
|
||||
* <li><b>sufficient.fields</b> - list of field names, separated by comma, which,
|
||||
* if all of them are missing, the document will be skipped. For example, to require
|
||||
* that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
|
||||
* that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>.
|
||||
* (optional, default: {@link #DEFAULT_SUFFICIENT_FIELDS}).
|
||||
* </ul>
|
||||
* <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
|
||||
* output is unspecified (as all will write to the same output file in a
|
||||
|
@ -53,13 +61,32 @@ import org.apache.lucene.document.Field;
|
|||
*/
|
||||
public class WriteLineDocTask extends PerfTask {
|
||||
|
||||
public static final String FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";
|
||||
|
||||
public final static char SEP = '\t';
|
||||
|
||||
/**
|
||||
* Fields to be written by default
|
||||
*/
|
||||
public static final String[] DEFAULT_FIELDS = new String[] {
|
||||
DocMaker.TITLE_FIELD,
|
||||
DocMaker.DATE_FIELD,
|
||||
DocMaker.BODY_FIELD,
|
||||
};
|
||||
|
||||
/**
|
||||
* Default fields which at least one of them is required to not skip the doc.
|
||||
*/
|
||||
public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
|
||||
|
||||
private int docSize = 0;
|
||||
private PrintWriter lineFileOut = null;
|
||||
private DocMaker docMaker;
|
||||
private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
|
||||
private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
|
||||
private final String[] fieldsToWrite;;
|
||||
private final boolean[] sufficientFields;
|
||||
private final boolean checkSufficientFields;
|
||||
|
||||
public WriteLineDocTask(PerfRunData runData) throws Exception {
|
||||
super(runData);
|
||||
|
@ -89,6 +116,51 @@ public class WriteLineDocTask extends PerfTask {
|
|||
}
|
||||
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16));
|
||||
docMaker = runData.getDocMaker();
|
||||
|
||||
// init fields
|
||||
String f2r = config.get("line.fields",null);
|
||||
if (f2r == null) {
|
||||
fieldsToWrite = DEFAULT_FIELDS;
|
||||
} else {
|
||||
if (f2r.indexOf(SEP)>=0) {
|
||||
throw new IllegalArgumentException("line.fields "+f2r+" should not contain the separator char: "+SEP);
|
||||
}
|
||||
fieldsToWrite = f2r.split(",");
|
||||
}
|
||||
|
||||
// init sufficient fields
|
||||
sufficientFields = new boolean[fieldsToWrite.length];
|
||||
String suff = config.get("sufficient.fields",DEFAULT_SUFFICIENT_FIELDS);
|
||||
if (",".equals(suff)) {
|
||||
checkSufficientFields = false;
|
||||
} else {
|
||||
checkSufficientFields = true;
|
||||
HashSet<String> sf = new HashSet<String>(Arrays.asList(suff.split(",")));
|
||||
for (int i=0; i<fieldsToWrite.length; i++) {
|
||||
if (sf.contains(fieldsToWrite[i])) {
|
||||
sufficientFields[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writeHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a header to the lines file - indicating how to read the file later
|
||||
*/
|
||||
private void writeHeader() {
|
||||
StringBuilder sb = threadBuffer.get();
|
||||
if (sb == null) {
|
||||
sb = new StringBuilder();
|
||||
threadBuffer.set(sb);
|
||||
}
|
||||
sb.setLength(0);
|
||||
sb.append(FIELDS_HEADER_INDICATOR);
|
||||
for (String f : fieldsToWrite) {
|
||||
sb.append(SEP).append(f);
|
||||
}
|
||||
lineFileOut.println(sb.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -106,27 +178,26 @@ public class WriteLineDocTask extends PerfTask {
|
|||
threadNormalizer.set(matcher);
|
||||
}
|
||||
|
||||
Field f = doc.getField(DocMaker.BODY_FIELD);
|
||||
String body = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
|
||||
|
||||
f = doc.getField(DocMaker.TITLE_FIELD);
|
||||
String title = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
|
||||
|
||||
if (body.length() > 0 || title.length() > 0) {
|
||||
|
||||
f = doc.getField(DocMaker.DATE_FIELD);
|
||||
String date = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
|
||||
|
||||
StringBuilder sb = threadBuffer.get();
|
||||
if (sb == null) {
|
||||
sb = new StringBuilder();
|
||||
threadBuffer.set(sb);
|
||||
}
|
||||
sb.setLength(0);
|
||||
sb.append(title).append(SEP).append(date).append(SEP).append(body);
|
||||
StringBuilder sb = threadBuffer.get();
|
||||
if (sb == null) {
|
||||
sb = new StringBuilder();
|
||||
threadBuffer.set(sb);
|
||||
}
|
||||
sb.setLength(0);
|
||||
|
||||
boolean sufficient = !checkSufficientFields;
|
||||
for (int i=0; i<fieldsToWrite.length; i++) {
|
||||
Field f = doc.getField(fieldsToWrite[i]);
|
||||
String text = f == null ? "" : matcher.reset(f.stringValue()).replaceAll(" ").trim();
|
||||
sb.append(text).append(SEP);
|
||||
sufficient |= text.length()>0 && sufficientFields[i];
|
||||
}
|
||||
if (sufficient) {
|
||||
sb.setLength(sb.length()-1); // remove redundant last separator
|
||||
// lineFileOut is a PrintWriter, which synchronizes internally in println.
|
||||
lineFileOut.println(sb.toString());
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
|
|||
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||
import org.apache.lucene.collation.CollationKeyAnalyzer;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
|
@ -393,8 +394,13 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
|
|||
|
||||
BufferedReader r = new BufferedReader(new FileReader(lineFile));
|
||||
int numLines = 0;
|
||||
while(r.readLine() != null)
|
||||
String line;
|
||||
while((line = r.readLine()) != null) {
|
||||
if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
|
||||
continue; // do not count the header line as a doc
|
||||
}
|
||||
numLines++;
|
||||
}
|
||||
r.close();
|
||||
assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.Properties;
|
||||
|
@ -28,6 +29,8 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
|||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
|
||||
|
@ -44,29 +47,71 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
|
||||
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
private void createBZ2LineFile(File file) throws Exception {
|
||||
private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
out = csFactory.createCompressorOutputStream("bzip2", out);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
StringBuilder doc = new StringBuilder();
|
||||
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
|
||||
writer.write(doc.toString());
|
||||
writer.newLine();
|
||||
writeDocsToFile(writer, addHeader, null);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void createRegularLineFile(File file) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields) throws IOException {
|
||||
if (addHeader) {
|
||||
writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
|
||||
writer.write(WriteLineDocTask.SEP);
|
||||
writer.write(DocMaker.TITLE_FIELD);
|
||||
writer.write(WriteLineDocTask.SEP);
|
||||
writer.write(DocMaker.DATE_FIELD);
|
||||
writer.write(WriteLineDocTask.SEP);
|
||||
writer.write(DocMaker.BODY_FIELD);
|
||||
if (otherFields!=null) {
|
||||
// additional field names in the header
|
||||
for (Object fn : otherFields.keySet()) {
|
||||
writer.write(WriteLineDocTask.SEP);
|
||||
writer.write(fn.toString());
|
||||
}
|
||||
}
|
||||
writer.newLine();
|
||||
}
|
||||
StringBuilder doc = new StringBuilder();
|
||||
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
|
||||
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
|
||||
if (otherFields!=null) {
|
||||
// additional field values in the doc line
|
||||
for (Object fv : otherFields.values()) {
|
||||
doc.append(WriteLineDocTask.SEP).append(fv.toString());
|
||||
}
|
||||
}
|
||||
writer.write(doc.toString());
|
||||
writer.newLine();
|
||||
}
|
||||
|
||||
private void createRegularLineFile(File file, boolean addHeader) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
writeDocsToFile(writer, addHeader, null);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws Exception {
|
||||
OutputStream out = new FileOutputStream(file);
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
|
||||
Properties p = new Properties();
|
||||
for (String f : extraFields) {
|
||||
p.setProperty(f, f);
|
||||
}
|
||||
writeDocsToFile(writer, true, p);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void doIndexAndSearchTest(File file, boolean setBZCompress,
|
||||
String bz2CompressVal) throws Exception {
|
||||
String bz2CompressVal, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
|
||||
doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 1, storedField); // no extra repetitions
|
||||
doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 2, storedField); // 1 extra repetition
|
||||
doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 4, storedField); // 3 extra repetitions
|
||||
}
|
||||
|
||||
private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress,
|
||||
String bz2CompressVal, Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {
|
||||
|
||||
Properties props = new Properties();
|
||||
|
||||
|
@ -75,11 +120,16 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
if (setBZCompress) {
|
||||
props.setProperty("bzip.compression", bz2CompressVal);
|
||||
}
|
||||
if (lineParserClass != null) {
|
||||
props.setProperty("line.parser", lineParserClass.getName());
|
||||
}
|
||||
|
||||
// Indexing configuration.
|
||||
props.setProperty("analyzer", MockAnalyzer.class.getName());
|
||||
props.setProperty("content.source", LineDocSource.class.getName());
|
||||
props.setProperty("directory", "RAMDirectory");
|
||||
props.setProperty("doc.stored", "true");
|
||||
props.setProperty("doc.index.props", "true");
|
||||
|
||||
// Create PerfRunData
|
||||
Config config = new Config(props);
|
||||
|
@ -87,34 +137,60 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
|
||||
TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
|
||||
tasks.addTask(new CreateIndexTask(runData));
|
||||
tasks.addTask(new AddDocTask(runData));
|
||||
for (int i=0; i<numAdds; i++) {
|
||||
tasks.addTask(new AddDocTask(runData));
|
||||
}
|
||||
tasks.addTask(new CloseIndexTask(runData));
|
||||
tasks.doLogic();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
|
||||
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
|
||||
assertEquals(1, td.totalHits);
|
||||
assertEquals(numAdds, td.totalHits);
|
||||
assertNotNull(td.scoreDocs[0]);
|
||||
|
||||
if (storedField==null) {
|
||||
storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
|
||||
}
|
||||
assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField));
|
||||
|
||||
searcher.close();
|
||||
}
|
||||
|
||||
/* Tests LineDocSource with a bzip2 input stream. */
|
||||
public void testBZip2() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file);
|
||||
doIndexAndSearchTest(file, true, "true");
|
||||
createBZ2LineFile(file,true);
|
||||
doIndexAndSearchTest(file, true, "true", null, null);
|
||||
}
|
||||
|
||||
public void testBZip2NoHeaderLine() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file,false);
|
||||
doIndexAndSearchTest(file, true, "true", null, null);
|
||||
}
|
||||
|
||||
public void testBZip2AutoDetect() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file);
|
||||
doIndexAndSearchTest(file, false, null);
|
||||
createBZ2LineFile(file,false);
|
||||
doIndexAndSearchTest(file, false, null, null, null);
|
||||
}
|
||||
|
||||
public void testRegularFile() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
createRegularLineFile(file);
|
||||
doIndexAndSearchTest(file, false, null);
|
||||
createRegularLineFile(file,true);
|
||||
doIndexAndSearchTest(file, false, null, null, null);
|
||||
}
|
||||
|
||||
public void testRegularFileSpecialHeader() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
createRegularLineFile(file,true);
|
||||
doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null);
|
||||
}
|
||||
|
||||
public void testRegularFileNoHeaderLine() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
createRegularLineFile(file,false);
|
||||
doIndexAndSearchTest(file, false, null, null, null);
|
||||
}
|
||||
|
||||
public void testInvalidFormat() throws Exception {
|
||||
|
@ -134,7 +210,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
writer.newLine();
|
||||
writer.close();
|
||||
try {
|
||||
doIndexAndSearchTest(file, false, null);
|
||||
doIndexAndSearchTest(file, false, null, null, null);
|
||||
fail("Some exception should have been thrown for: [" + testCases[i] + "]");
|
||||
} catch (Exception e) {
|
||||
// expected.
|
||||
|
@ -142,4 +218,19 @@ public class LineDocSourceTest extends BenchmarkTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/** Doc Name is not part of the default header */
|
||||
public void testWithDocsName() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
|
||||
doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD);
|
||||
}
|
||||
|
||||
/** Use fields names that are not defined in Docmaker and so will go to Properties */
|
||||
public void testWithProperties() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
String specialField = "mySpecialField";
|
||||
createRegularLineFileWithMoreFields(file, specialField);
|
||||
doIndexAndSearchTest(file, false, null, null, specialField);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -98,6 +98,25 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
return doc;
|
||||
}
|
||||
}
|
||||
|
||||
// class has to be public so that Class.forName.newInstance() will work
|
||||
// same as JustDate just that this one is treated as legal
|
||||
public static final class LegalJustDateDocMaker extends DocMaker {
|
||||
@Override
|
||||
public Document makeDocument() throws Exception {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
||||
// class has to be public so that Class.forName.newInstance() will work
|
||||
public static final class EmptyDocMaker extends DocMaker {
|
||||
@Override
|
||||
public Document makeDocument() throws Exception {
|
||||
return new Document();
|
||||
}
|
||||
}
|
||||
|
||||
// class has to be public so that Class.forName.newInstance() will work
|
||||
public static final class ThreadingDocMaker extends DocMaker {
|
||||
|
@ -117,6 +136,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
private PerfRunData createPerfRunData(File file, boolean setBZCompress,
|
||||
boolean allowEmptyDocs,
|
||||
String bz2CompressVal,
|
||||
String docMakerName) throws Exception {
|
||||
Properties props = new Properties();
|
||||
|
@ -126,6 +146,13 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
props.setProperty("bzip.compression", bz2CompressVal);
|
||||
}
|
||||
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
|
||||
if (allowEmptyDocs) {
|
||||
props.setProperty("sufficient.fields", ",");
|
||||
}
|
||||
if (docMakerName.equals(LegalJustDateDocMaker.class.getName())) {
|
||||
props.setProperty("line.fields", DocMaker.DATE_FIELD);
|
||||
props.setProperty("sufficient.fields", DocMaker.DATE_FIELD);
|
||||
}
|
||||
Config config = new Config(props);
|
||||
return new PerfRunData(config);
|
||||
}
|
||||
|
@ -139,6 +166,8 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
line = br.readLine();
|
||||
assertNotNull(line);
|
||||
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||
int numExpParts = expBody == null ? 2 : 3;
|
||||
|
@ -153,13 +182,17 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void assertHeaderLine(String line) {
|
||||
assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
|
||||
}
|
||||
|
||||
/* Tests WriteLineDocTask with a bzip2 format. */
|
||||
public void testBZip2() throws Exception {
|
||||
|
||||
// Create a document in bz2 format.
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -171,7 +204,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
|
||||
// Create a document in bz2 format.
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -183,7 +216,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
|
||||
// Create a document in regular format.
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -196,7 +229,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
// separator char. However, it didn't replace newline characters, which
|
||||
// resulted in errors in LineDocSource.
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -209,7 +242,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
// had a TITLE element (LUCENE-1755). It should throw away documents if they
|
||||
// don't have BODY nor TITLE
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, NoBodyDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -219,7 +252,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
|
||||
public void testEmptyTitle() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, NoTitleDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -227,9 +260,10 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
doReadTest(file, false, "", "date", "body");
|
||||
}
|
||||
|
||||
/** Fail by default when there's only date */
|
||||
public void testJustDate() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, JustDateDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
@ -237,15 +271,53 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
line = br.readLine();
|
||||
assertNull(line);
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testLegalJustDate() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
line = br.readLine();
|
||||
assertNotNull(line);
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testEmptyDoc() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
wldt.doLogic();
|
||||
wldt.close();
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line);
|
||||
line = br.readLine();
|
||||
assertNotNull(line);
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testMultiThreaded() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, ThreadingDocMaker.class.getName());
|
||||
PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName());
|
||||
final WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
Thread[] threads = new Thread[10];
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
|
@ -269,8 +341,10 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
Set<String> ids = new HashSet<String>();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
|
||||
try {
|
||||
String line = br.readLine();
|
||||
assertHeaderLine(line); // header line is written once, no matter how many threads there are
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
String line = br.readLine();
|
||||
line = br.readLine();
|
||||
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||
assertEquals(3, parts.length);
|
||||
// check that all thread names written are the same in the same line
|
||||
|
|
Loading…
Reference in New Issue