diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt
index bebbd350e57..f44992101a5 100644
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@@ -3,6 +3,11 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
$Id:$
+6/23/09
+ LUCENE-1714: WriteLineDocTask incorrectly normalized text, by replacing only
+ occurrences of "\t" with a space. It now replaces "\r\n" in addition to that,
+ so that LineDocMaker won't fail. (Shai Erera via Michael McCandless)
+
6/17/09
LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been
replaced with a concrete class which accepts a ContentSource for iterating over
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
index 70d64be470d..cb6a6c0d986 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
@@ -22,6 +22,8 @@ import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
@@ -36,7 +38,6 @@ import org.apache.lucene.document.Field;
* taske can be consumed by
* {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
* to save the IO overhead of opening a file per doument to be indexed.
- *
* Supports the following parameters:
*
* - line.file.out - the name of the file to write the output to. That
@@ -45,10 +46,14 @@ import org.apache.lucene.document.Field;
* recommended when the output file is expected to be large. (optional, default:
* false).
*
+ * NOTE: this class is not thread-safe and if used by multiple threads the
+ * output is unspecified (as all will write to the same ouput file in a
+ * non-synchronized way).
*/
public class WriteLineDocTask extends PerfTask {
public final static char SEP = '\t';
+ private static final Matcher NORMALIZER = Pattern.compile("[\t\r\n]+").matcher("");
private int docSize = 0;
private BufferedWriter lineFileOut = null;
@@ -92,14 +97,14 @@ public class WriteLineDocTask extends PerfTask {
Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
Field f = doc.getField(DocMaker.BODY_FIELD);
- String body = f != null ? f.stringValue().replace('\t', ' ') : null;
+ String body = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : null;
if (body != null) {
f = doc.getField(DocMaker.TITLE_FIELD);
- String title = f != null ? f.stringValue().replace('\t', ' ') : "";
+ String title = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
f = doc.getField(DocMaker.DATE_FIELD);
- String date = f != null ? f.stringValue().replace('\t', ' ') : "";
+ String date = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
lineFileOut.write(title, 0, title.length());
lineFileOut.write(SEP);
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
index dfc7967a1e1..8c4f37f7d2a 100644
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
@@ -27,9 +27,7 @@ import java.util.Properties;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -42,10 +40,6 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
// class has to be public so that Class.forName.newInstance() will work
public static final class WriteLineDocMaker extends DocMaker {
- protected DocData getNextDocData() throws NoMoreDataException, Exception {
- throw new UnsupportedOperationException("not implemented");
- }
-
public Document makeDocument() throws Exception {
Document doc = new Document();
doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
@@ -54,17 +48,28 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
return doc;
}
- public int numUniqueTexts() {
- return 0;
+ }
+
+ // class has to be public so that Class.forName.newInstance() will work
+ public static final class NewLinesDocMaker extends DocMaker {
+
+ public Document makeDocument() throws Exception {
+ Document doc = new Document();
+ doc.add(new Field(BODY_FIELD, "body\r\ntext\ttwo", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ doc.add(new Field(TITLE_FIELD, "title\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ doc.add(new Field(DATE_FIELD, "date\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ return doc;
}
}
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
- private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
+ private PerfRunData createPerfRunData(File file, boolean setBZCompress,
+ String bz2CompressVal,
+ String docMakerName) throws Exception {
Properties props = new Properties();
- props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
+ props.setProperty("doc.maker", docMakerName);
props.setProperty("line.file.out", file.getAbsolutePath());
if (setBZCompress) {
props.setProperty("bzip.compression", bz2CompressVal);
@@ -74,7 +79,8 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
return new PerfRunData(config);
}
- private void doReadTest(File file, boolean bz2File) throws Exception {
+ private void doReadTest(File file, boolean bz2File, String expTitle,
+ String expDate, String expBody) throws Exception {
InputStream in = new FileInputStream(file);
if (bz2File) {
in = csFactory.createCompressorInputStream("bzip2", in);
@@ -85,9 +91,9 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
assertNotNull(line);
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
assertEquals(3, parts.length);
- assertEquals("title", parts[0]);
- assertEquals("date", parts[1]);
- assertEquals("body", parts[2]);
+ assertEquals(expTitle, parts[0]);
+ assertEquals(expDate, parts[1]);
+ assertEquals(expBody, parts[2]);
assertNull(br.readLine());
} finally {
br.close();
@@ -99,36 +105,48 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
// Create a document in bz2 format.
File file = new File(getWorkDir(), "one-line.bz2");
- PerfRunData runData = createPerfRunData(file, true, "true");
+ PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
- doReadTest(file, true);
+ doReadTest(file, true, "title", "date", "body");
}
public void testBZip2AutoDetect() throws Exception {
// Create a document in bz2 format.
File file = new File(getWorkDir(), "one-line.bz2");
- PerfRunData runData = createPerfRunData(file, false, null);
+ PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
- doReadTest(file, true);
+ doReadTest(file, true, "title", "date", "body");
}
public void testRegularFile() throws Exception {
// Create a document in regular format.
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, true, "false");
+ PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
- doReadTest(file, false);
+ doReadTest(file, false, "title", "date", "body");
+ }
+
+ public void testCharsReplace() throws Exception {
+ // WriteLineDocTask replaced only \t characters w/ a space, since that's its
+ // separator char. However, it didn't replace newline characters, which
+ // resulted in errors in LineDocMaker.
+ File file = new File(getWorkDir(), "one-line");
+ PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
+ WriteLineDocTask wldt = new WriteLineDocTask(runData);
+ wldt.doLogic();
+ wldt.close();
+
+ doReadTest(file, false, "title text", "date text", "body text two");
}
-
}