From 03d6b2aa24cd97173e4b042e9ef6f34d4bfe9a41 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 12 Jan 2009 11:37:23 +0000 Subject: [PATCH] LUCENE-1479: if date is missing, don't skip the doc; just don't add 'docdate' field to the doc git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@733697 13f79535-47bb-0310-9956-ffa450edef68 --- .../benchmark/byTask/feeds/TrecDocMaker.java | 77 +++-- .../byTask/feeds/TrecDocMakerTest.java | 321 ++++++++++++++++++ 2 files changed, 371 insertions(+), 27 deletions(-) create mode 100644 contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java index 566020addb0..38494c163ef 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java @@ -33,7 +33,6 @@ import java.util.zip.GZIPInputStream; import org.apache.lucene.benchmark.byTask.utils.Config; - /** * A DocMaker using the (compressed) Trec collection for its input. *

@@ -44,7 +43,14 @@ import org.apache.lucene.benchmark.byTask.utils.Config; */ public class TrecDocMaker extends BasicDocMaker { - private static final String newline = System.getProperty("line.separator"); + private static final String DATE = "Date: "; + private static final String DOCHDR = ""; + private static final String TERM_DOCHDR = ""; + private static final String TERM_DOCNO = ""; + private static final String DOCNO = ""; + private static final String TERM_DOC = ""; + private static final String DOC = ""; + private static final String NEW_LINE = System.getProperty("line.separator"); protected ThreadLocal dateFormat = new ThreadLocal(); protected File dataDir = null; @@ -135,25 +141,37 @@ public class TrecDocMaker extends BasicDocMaker { } // read until finding a line that starts with the specified prefix - protected StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception { + protected StringBuffer read(String prefix, StringBuffer sb, + boolean collectMatchLine, boolean collectAll, + String terminatingTag) throws Exception { sb = (sb==null ? new StringBuffer() : sb); String sep = ""; while (true) { String line = reader.readLine(); - if (line==null) { + if (line == null) { openNextFile(); continue; } if (line.startsWith(prefix)) { if (collectMatchLine) { - sb.append(sep+line); - sep = newline; + sb.append(sep).append(line); + sep = NEW_LINE; } break; } + + if (terminatingTag != null && line.startsWith(terminatingTag)) { + // didn't find the prefix that was asked, but the terminating + // tag was found. set the length to 0 to signal no match was + // found. + sb.setLength(0); + break; + } + + if (collectAll) { - sb.append(sep+line); - sep = newline; + sb.append(sep).append(line); + sep = NEW_LINE; } } //System.out.println("read: "+sb); @@ -165,22 +183,31 @@ public class TrecDocMaker extends BasicDocMaker { openNextFile(); } // 1. skip until doc start - read("",null,false,false); + read(DOC,null,false,false,null); // 2. name - StringBuffer sb = read("",null,true,false); - String name = sb.substring("".length()); - name = name.substring(0,name.indexOf(""))+"_"+iteration; + StringBuffer sb = read(DOCNO,null,true,false,null); + String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length())); + name = name + "_" + iteration; // 3. skip until doc header - read("",null,false,false); + read(DOCHDR,null,false,false,null); + boolean findTerminatingDocHdr = false; // 4. date - sb = read("Date: ",null,true,false); - String dateStr = sb.substring("Date: ".length()); + sb = read(DATE,null,true,false,TERM_DOCHDR); + String dateStr = null; + if (sb.length() != 0) { + // Date found. + dateStr = sb.substring(DATE.length()); + findTerminatingDocHdr = true; + } + // 5. skip until end of doc header - read("",null,false,false); + if (findTerminatingDocHdr) { + read(TERM_DOCHDR,null,false,false,null); + } // 6. collect until end of doc - sb = read("",null,false,true); + sb = read(TERM_DOC,null,false,true,null); // this is the next document, so parse it - Date date = parseDate(dateStr); + Date date = dateStr != null ? parseDate(dateStr) : null; HTMLParser p = getHtmlParser(); DocData docData = p.parse(name, date, sb, getDateFormat(0)); addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). @@ -202,18 +229,14 @@ public class TrecDocMaker extends BasicDocMaker { } protected Date parseDate(String dateStr) { - Date date = null; - for (int i=0; iTEST-000\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + ""; + StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false); + stdm.setHTMLParser(new DemoHTMLParser()); + + DocData dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm + .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); + + assertNoMoreDataException(stdm); + } + + public void testTwoDocuments() throws Exception { + String docs = "\r\n" + + "TEST-000\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-001\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-001 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-001 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + ""; + StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false); + stdm.setHTMLParser(new DemoHTMLParser()); + + DocData dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm + .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); + + dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm + .parseDate("Sun, 11 Jan 2009 08:01:00 GMT")); + + assertNoMoreDataException(stdm); + } + + // If a Date: attribute is missing, make sure the document is not skipped, but + // rather that null Data is assigned. + public void testMissingDate() throws Exception { + String docs = "\r\n" + + "TEST-000\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-001\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-001 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-001 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + ""; + StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false); + stdm.setHTMLParser(new DemoHTMLParser()); + + DocData dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null); + + dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm + .parseDate("Sun, 11 Jan 2009 08:01:00 GMT")); + + assertNoMoreDataException(stdm); + } + + // When a 'bad date' is input (unparsable date), make sure the DocData date is + // assigned null. + public void testBadDate() throws Exception { + String docs = "\r\n" + + "TEST-000\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Date: Bad Date\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + ""; + StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false); + stdm.setHTMLParser(new DemoHTMLParser()); + + DocData dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null); + + assertNoMoreDataException(stdm); + } + + public void testForever() throws Exception { + String docs = "\r\n" + + "TEST-000\r\n" + + "\r\n" + + "http://lucene.apache.org.trecdocmaker.test\r\n" + + "HTTP/1.1 200 OK\r\n" + + "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Server: Apache/1.3.27 (Unix)\r\n" + + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + + "Content-Length: 614\r\n" + + "Connection: close\r\n" + + "Content-Type: text/html\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + ""; + StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true); + stdm.setHTMLParser(new DemoHTMLParser()); + + DocData dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm + .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); + + // same document, but the second iteration changes the name. + dd = stdm.getNextDocData(); + assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm + .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); + + // Don't test that NoMoreDataException is thrown, since the forever flag is + // turned on. + } + +}