mirror of https://github.com/apache/lucene.git
LUCENE-1714: fix WriteLineDocTask to also replace \r, \n (in addition to \t) with space so those chars don't create mal-formed lines
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787750 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
24d827fd70
commit
5f72065d0f
|
@ -3,6 +3,11 @@ Lucene Benchmark Contrib Change Log
|
||||||
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
|
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
|
||||||
|
|
||||||
$Id:$
|
$Id:$
|
||||||
|
6/23/09
|
||||||
|
LUCENE-1714: WriteLineDocTask incorrectly normalized text, by replacing only
|
||||||
|
occurrences of "\t" with a space. It now replaces "\r\n" in addition to that,
|
||||||
|
so that LineDocMaker won't fail. (Shai Erera via Michael McCandless)
|
||||||
|
|
||||||
6/17/09
|
6/17/09
|
||||||
LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been
|
LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been
|
||||||
replaced with a concrete class which accepts a ContentSource for iterating over
|
replaced with a concrete class which accepts a ContentSource for iterating over
|
||||||
|
|
|
@ -22,6 +22,8 @@ import java.io.BufferedWriter;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
@ -36,7 +38,6 @@ import org.apache.lucene.document.Field;
|
||||||
* taske can be consumed by
|
* taske can be consumed by
|
||||||
* {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
|
* {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
|
||||||
* to save the IO overhead of opening a file per doument to be indexed.<br>
|
* to save the IO overhead of opening a file per doument to be indexed.<br>
|
||||||
*
|
|
||||||
* Supports the following parameters:
|
* Supports the following parameters:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>line.file.out - the name of the file to write the output to. That
|
* <li>line.file.out - the name of the file to write the output to. That
|
||||||
|
@ -45,10 +46,14 @@ import org.apache.lucene.document.Field;
|
||||||
* recommended when the output file is expected to be large. (optional, default:
|
* recommended when the output file is expected to be large. (optional, default:
|
||||||
* false).
|
* false).
|
||||||
* </ul>
|
* </ul>
|
||||||
|
* <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
|
||||||
|
* output is unspecified (as all will write to the same ouput file in a
|
||||||
|
* non-synchronized way).
|
||||||
*/
|
*/
|
||||||
public class WriteLineDocTask extends PerfTask {
|
public class WriteLineDocTask extends PerfTask {
|
||||||
|
|
||||||
public final static char SEP = '\t';
|
public final static char SEP = '\t';
|
||||||
|
private static final Matcher NORMALIZER = Pattern.compile("[\t\r\n]+").matcher("");
|
||||||
|
|
||||||
private int docSize = 0;
|
private int docSize = 0;
|
||||||
private BufferedWriter lineFileOut = null;
|
private BufferedWriter lineFileOut = null;
|
||||||
|
@ -92,14 +97,14 @@ public class WriteLineDocTask extends PerfTask {
|
||||||
Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
|
Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
|
||||||
|
|
||||||
Field f = doc.getField(DocMaker.BODY_FIELD);
|
Field f = doc.getField(DocMaker.BODY_FIELD);
|
||||||
String body = f != null ? f.stringValue().replace('\t', ' ') : null;
|
String body = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : null;
|
||||||
|
|
||||||
if (body != null) {
|
if (body != null) {
|
||||||
f = doc.getField(DocMaker.TITLE_FIELD);
|
f = doc.getField(DocMaker.TITLE_FIELD);
|
||||||
String title = f != null ? f.stringValue().replace('\t', ' ') : "";
|
String title = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
|
||||||
|
|
||||||
f = doc.getField(DocMaker.DATE_FIELD);
|
f = doc.getField(DocMaker.DATE_FIELD);
|
||||||
String date = f != null ? f.stringValue().replace('\t', ' ') : "";
|
String date = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
|
||||||
|
|
||||||
lineFileOut.write(title, 0, title.length());
|
lineFileOut.write(title, 0, title.length());
|
||||||
lineFileOut.write(SEP);
|
lineFileOut.write(SEP);
|
||||||
|
|
|
@ -27,9 +27,7 @@ import java.util.Properties;
|
||||||
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
import org.apache.commons.compress.compressors.CompressorStreamFactory;
|
||||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.DocData;
|
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
@ -42,10 +40,6 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
// class has to be public so that Class.forName.newInstance() will work
|
// class has to be public so that Class.forName.newInstance() will work
|
||||||
public static final class WriteLineDocMaker extends DocMaker {
|
public static final class WriteLineDocMaker extends DocMaker {
|
||||||
|
|
||||||
protected DocData getNextDocData() throws NoMoreDataException, Exception {
|
|
||||||
throw new UnsupportedOperationException("not implemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
public Document makeDocument() throws Exception {
|
public Document makeDocument() throws Exception {
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
@ -54,17 +48,28 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int numUniqueTexts() {
|
}
|
||||||
return 0;
|
|
||||||
|
// class has to be public so that Class.forName.newInstance() will work
|
||||||
|
public static final class NewLinesDocMaker extends DocMaker {
|
||||||
|
|
||||||
|
public Document makeDocument() throws Exception {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field(BODY_FIELD, "body\r\ntext\ttwo", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
doc.add(new Field(TITLE_FIELD, "title\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
doc.add(new Field(DATE_FIELD, "date\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
return doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||||
|
|
||||||
private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
|
private PerfRunData createPerfRunData(File file, boolean setBZCompress,
|
||||||
|
String bz2CompressVal,
|
||||||
|
String docMakerName) throws Exception {
|
||||||
Properties props = new Properties();
|
Properties props = new Properties();
|
||||||
props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
|
props.setProperty("doc.maker", docMakerName);
|
||||||
props.setProperty("line.file.out", file.getAbsolutePath());
|
props.setProperty("line.file.out", file.getAbsolutePath());
|
||||||
if (setBZCompress) {
|
if (setBZCompress) {
|
||||||
props.setProperty("bzip.compression", bz2CompressVal);
|
props.setProperty("bzip.compression", bz2CompressVal);
|
||||||
|
@ -74,7 +79,8 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
return new PerfRunData(config);
|
return new PerfRunData(config);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void doReadTest(File file, boolean bz2File) throws Exception {
|
private void doReadTest(File file, boolean bz2File, String expTitle,
|
||||||
|
String expDate, String expBody) throws Exception {
|
||||||
InputStream in = new FileInputStream(file);
|
InputStream in = new FileInputStream(file);
|
||||||
if (bz2File) {
|
if (bz2File) {
|
||||||
in = csFactory.createCompressorInputStream("bzip2", in);
|
in = csFactory.createCompressorInputStream("bzip2", in);
|
||||||
|
@ -85,9 +91,9 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
assertNotNull(line);
|
assertNotNull(line);
|
||||||
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
|
||||||
assertEquals(3, parts.length);
|
assertEquals(3, parts.length);
|
||||||
assertEquals("title", parts[0]);
|
assertEquals(expTitle, parts[0]);
|
||||||
assertEquals("date", parts[1]);
|
assertEquals(expDate, parts[1]);
|
||||||
assertEquals("body", parts[2]);
|
assertEquals(expBody, parts[2]);
|
||||||
assertNull(br.readLine());
|
assertNull(br.readLine());
|
||||||
} finally {
|
} finally {
|
||||||
br.close();
|
br.close();
|
||||||
|
@ -99,36 +105,48 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
||||||
|
|
||||||
// Create a document in bz2 format.
|
// Create a document in bz2 format.
|
||||||
File file = new File(getWorkDir(), "one-line.bz2");
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
PerfRunData runData = createPerfRunData(file, true, "true");
|
PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
|
||||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
wldt.doLogic();
|
wldt.doLogic();
|
||||||
wldt.close();
|
wldt.close();
|
||||||
|
|
||||||
doReadTest(file, true);
|
doReadTest(file, true, "title", "date", "body");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBZip2AutoDetect() throws Exception {
|
public void testBZip2AutoDetect() throws Exception {
|
||||||
|
|
||||||
// Create a document in bz2 format.
|
// Create a document in bz2 format.
|
||||||
File file = new File(getWorkDir(), "one-line.bz2");
|
File file = new File(getWorkDir(), "one-line.bz2");
|
||||||
PerfRunData runData = createPerfRunData(file, false, null);
|
PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
|
||||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
wldt.doLogic();
|
wldt.doLogic();
|
||||||
wldt.close();
|
wldt.close();
|
||||||
|
|
||||||
doReadTest(file, true);
|
doReadTest(file, true, "title", "date", "body");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRegularFile() throws Exception {
|
public void testRegularFile() throws Exception {
|
||||||
|
|
||||||
// Create a document in regular format.
|
// Create a document in regular format.
|
||||||
File file = new File(getWorkDir(), "one-line");
|
File file = new File(getWorkDir(), "one-line");
|
||||||
PerfRunData runData = createPerfRunData(file, true, "false");
|
PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
|
||||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
wldt.doLogic();
|
wldt.doLogic();
|
||||||
wldt.close();
|
wldt.close();
|
||||||
|
|
||||||
doReadTest(file, false);
|
doReadTest(file, false, "title", "date", "body");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCharsReplace() throws Exception {
|
||||||
|
// WriteLineDocTask replaced only \t characters w/ a space, since that's its
|
||||||
|
// separator char. However, it didn't replace newline characters, which
|
||||||
|
// resulted in errors in LineDocMaker.
|
||||||
|
File file = new File(getWorkDir(), "one-line");
|
||||||
|
PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
|
||||||
|
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||||
|
wldt.doLogic();
|
||||||
|
wldt.close();
|
||||||
|
|
||||||
|
doReadTest(file, false, "title text", "date text", "body text two");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue