From 5f72065d0f4be2b861e608ee6ff3415843e3a724 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 23 Jun 2009 16:46:17 +0000 Subject: [PATCH] LUCENE-1714: fix WriteLineDocTask to also replace \r, \n (in addition to \t) with space so those chars don't create mal-formed lines git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787750 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/benchmark/CHANGES.txt | 5 ++ .../byTask/tasks/WriteLineDocTask.java | 13 ++-- .../byTask/tasks/WriteLineDocTaskTest.java | 60 ++++++++++++------- 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index bebbd350e57..f44992101a5 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -3,6 +3,11 @@ Lucene Benchmark Contrib Change Log The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. $Id:$ +6/23/09 + LUCENE-1714: WriteLineDocTask incorrectly normalized text, by replacing only + occurrences of "\t" with a space. It now replaces "\r\n" in addition to that, + so that LineDocMaker won't fail. (Shai Erera via Michael McCandless) + 6/17/09 LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been replaced with a concrete class which accepts a ContentSource for iterating over diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java index 70d64be470d..cb6a6c0d986 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java @@ -22,6 +22,8 @@ import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; @@ -36,7 +38,6 @@ import org.apache.lucene.document.Field; * taske can be consumed by * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended * to save the IO overhead of opening a file per doument to be indexed.
- * * Supports the following parameters: * + * NOTE: this class is not thread-safe and if used by multiple threads the + * output is unspecified (as all will write to the same ouput file in a + * non-synchronized way). */ public class WriteLineDocTask extends PerfTask { public final static char SEP = '\t'; + private static final Matcher NORMALIZER = Pattern.compile("[\t\r\n]+").matcher(""); private int docSize = 0; private BufferedWriter lineFileOut = null; @@ -92,14 +97,14 @@ public class WriteLineDocTask extends PerfTask { Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); Field f = doc.getField(DocMaker.BODY_FIELD); - String body = f != null ? f.stringValue().replace('\t', ' ') : null; + String body = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : null; if (body != null) { f = doc.getField(DocMaker.TITLE_FIELD); - String title = f != null ? f.stringValue().replace('\t', ' ') : ""; + String title = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : ""; f = doc.getField(DocMaker.DATE_FIELD); - String date = f != null ? f.stringValue().replace('\t', ' ') : ""; + String date = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : ""; lineFileOut.write(title, 0, title.length()); lineFileOut.write(SEP); diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java index dfc7967a1e1..8c4f37f7d2a 100644 --- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java @@ -27,9 +27,7 @@ import java.util.Properties; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; -import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -42,10 +40,6 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { // class has to be public so that Class.forName.newInstance() will work public static final class WriteLineDocMaker extends DocMaker { - protected DocData getNextDocData() throws NoMoreDataException, Exception { - throw new UnsupportedOperationException("not implemented"); - } - public Document makeDocument() throws Exception { Document doc = new Document(); doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); @@ -54,17 +48,28 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { return doc; } - public int numUniqueTexts() { - return 0; + } + + // class has to be public so that Class.forName.newInstance() will work + public static final class NewLinesDocMaker extends DocMaker { + + public Document makeDocument() throws Exception { + Document doc = new Document(); + doc.add(new Field(BODY_FIELD, "body\r\ntext\ttwo", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); + doc.add(new Field(TITLE_FIELD, "title\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); + doc.add(new Field(DATE_FIELD, "date\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); + return doc; } } private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); - private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception { + private PerfRunData createPerfRunData(File file, boolean setBZCompress, + String bz2CompressVal, + String docMakerName) throws Exception { Properties props = new Properties(); - props.setProperty("doc.maker", WriteLineDocMaker.class.getName()); + props.setProperty("doc.maker", docMakerName); props.setProperty("line.file.out", file.getAbsolutePath()); if (setBZCompress) { props.setProperty("bzip.compression", bz2CompressVal); @@ -74,7 +79,8 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { return new PerfRunData(config); } - private void doReadTest(File file, boolean bz2File) throws Exception { + private void doReadTest(File file, boolean bz2File, String expTitle, + String expDate, String expBody) throws Exception { InputStream in = new FileInputStream(file); if (bz2File) { in = csFactory.createCompressorInputStream("bzip2", in); @@ -85,9 +91,9 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { assertNotNull(line); String[] parts = line.split(Character.toString(WriteLineDocTask.SEP)); assertEquals(3, parts.length); - assertEquals("title", parts[0]); - assertEquals("date", parts[1]); - assertEquals("body", parts[2]); + assertEquals(expTitle, parts[0]); + assertEquals(expDate, parts[1]); + assertEquals(expBody, parts[2]); assertNull(br.readLine()); } finally { br.close(); @@ -99,36 +105,48 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { // Create a document in bz2 format. File file = new File(getWorkDir(), "one-line.bz2"); - PerfRunData runData = createPerfRunData(file, true, "true"); + PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, true); + doReadTest(file, true, "title", "date", "body"); } public void testBZip2AutoDetect() throws Exception { // Create a document in bz2 format. File file = new File(getWorkDir(), "one-line.bz2"); - PerfRunData runData = createPerfRunData(file, false, null); + PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, true); + doReadTest(file, true, "title", "date", "body"); } public void testRegularFile() throws Exception { // Create a document in regular format. File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, true, "false"); + PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, false); + doReadTest(file, false, "title", "date", "body"); + } + + public void testCharsReplace() throws Exception { + // WriteLineDocTask replaced only \t characters w/ a space, since that's its + // separator char. However, it didn't replace newline characters, which + // resulted in errors in LineDocMaker. + File file = new File(getWorkDir(), "one-line"); + PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName()); + WriteLineDocTask wldt = new WriteLineDocTask(runData); + wldt.doLogic(); + wldt.close(); + + doReadTest(file, false, "title text", "date text", "body text two"); } - }