diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
index 566020addb0..38494c163ef 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
@@ -33,7 +33,6 @@ import java.util.zip.GZIPInputStream;
import org.apache.lucene.benchmark.byTask.utils.Config;
-
/**
* A DocMaker using the (compressed) Trec collection for its input.
*
@@ -44,7 +43,14 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
*/
public class TrecDocMaker extends BasicDocMaker {
- private static final String newline = System.getProperty("line.separator");
+ private static final String DATE = "Date: ";
+ private static final String DOCHDR = "";
+ private static final String TERM_DOCHDR = "";
+ private static final String TERM_DOCNO = "";
+ private static final String DOCNO = "";
+ private static final String TERM_DOC = "";
+ private static final String DOC = "";
+ private static final String NEW_LINE = System.getProperty("line.separator");
protected ThreadLocal dateFormat = new ThreadLocal();
protected File dataDir = null;
@@ -135,25 +141,37 @@ public class TrecDocMaker extends BasicDocMaker {
}
// read until finding a line that starts with the specified prefix
- protected StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception {
+ protected StringBuffer read(String prefix, StringBuffer sb,
+ boolean collectMatchLine, boolean collectAll,
+ String terminatingTag) throws Exception {
sb = (sb==null ? new StringBuffer() : sb);
String sep = "";
while (true) {
String line = reader.readLine();
- if (line==null) {
+ if (line == null) {
openNextFile();
continue;
}
if (line.startsWith(prefix)) {
if (collectMatchLine) {
- sb.append(sep+line);
- sep = newline;
+ sb.append(sep).append(line);
+ sep = NEW_LINE;
}
break;
}
+
+ if (terminatingTag != null && line.startsWith(terminatingTag)) {
+ // didn't find the prefix that was asked, but the terminating
+ // tag was found. set the length to 0 to signal no match was
+ // found.
+ sb.setLength(0);
+ break;
+ }
+
+
if (collectAll) {
- sb.append(sep+line);
- sep = newline;
+ sb.append(sep).append(line);
+ sep = NEW_LINE;
}
}
//System.out.println("read: "+sb);
@@ -165,22 +183,31 @@ public class TrecDocMaker extends BasicDocMaker {
openNextFile();
}
// 1. skip until doc start
- read("",null,false,false);
+ read(DOC,null,false,false,null);
// 2. name
- StringBuffer sb = read("",null,true,false);
- String name = sb.substring("".length());
- name = name.substring(0,name.indexOf(""))+"_"+iteration;
+ StringBuffer sb = read(DOCNO,null,true,false,null);
+ String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length()));
+ name = name + "_" + iteration;
// 3. skip until doc header
- read("",null,false,false);
+ read(DOCHDR,null,false,false,null);
+ boolean findTerminatingDocHdr = false;
// 4. date
- sb = read("Date: ",null,true,false);
- String dateStr = sb.substring("Date: ".length());
+ sb = read(DATE,null,true,false,TERM_DOCHDR);
+ String dateStr = null;
+ if (sb.length() != 0) {
+ // Date found.
+ dateStr = sb.substring(DATE.length());
+ findTerminatingDocHdr = true;
+ }
+
// 5. skip until end of doc header
- read("",null,false,false);
+ if (findTerminatingDocHdr) {
+ read(TERM_DOCHDR,null,false,false,null);
+ }
// 6. collect until end of doc
- sb = read("",null,false,true);
+ sb = read(TERM_DOC,null,false,true,null);
// this is the next document, so parse it
- Date date = parseDate(dateStr);
+ Date date = dateStr != null ? parseDate(dateStr) : null;
HTMLParser p = getHtmlParser();
DocData docData = p.parse(name, date, sb, getDateFormat(0));
addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
@@ -202,18 +229,14 @@ public class TrecDocMaker extends BasicDocMaker {
}
protected Date parseDate(String dateStr) {
- Date date = null;
- for (int i=0; iTEST-000\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "
\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "";
+ StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+ stdm.setHTMLParser(new DemoHTMLParser());
+
+ DocData dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ assertNoMoreDataException(stdm);
+ }
+
+ public void testTwoDocuments() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "";
+ StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+ stdm.setHTMLParser(new DemoHTMLParser());
+
+ DocData dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
+ .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+
+ assertNoMoreDataException(stdm);
+ }
+
+ // If a Date: attribute is missing, make sure the document is not skipped, but
+ // rather that null Data is assigned.
+ public void testMissingDate() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-001 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "";
+ StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+ stdm.setHTMLParser(new DemoHTMLParser());
+
+ DocData dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+
+ dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
+ .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+
+ assertNoMoreDataException(stdm);
+ }
+
+ // When a 'bad date' is input (unparsable date), make sure the DocData date is
+ // assigned null.
+ public void testBadDate() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Bad Date\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "";
+ StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+ stdm.setHTMLParser(new DemoHTMLParser());
+
+ DocData dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+
+ assertNoMoreDataException(stdm);
+ }
+
+ public void testForever() throws Exception {
+ String docs = "\r\n" +
+ "TEST-000\r\n" +
+ "\r\n" +
+ "http://lucene.apache.org.trecdocmaker.test\r\n" +
+ "HTTP/1.1 200 OK\r\n" +
+ "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Server: Apache/1.3.27 (Unix)\r\n" +
+ "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
+ "Content-Length: 614\r\n" +
+ "Connection: close\r\n" +
+ "Content-Type: text/html\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 title\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "TEST-000 text\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "\r\n" +
+ "";
+ StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true);
+ stdm.setHTMLParser(new DemoHTMLParser());
+
+ DocData dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ // same document, but the second iteration changes the name.
+ dd = stdm.getNextDocData();
+ assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm
+ .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+ // Don't test that NoMoreDataException is thrown, since the forever flag is
+ // turned on.
+ }
+
+}