mirror of https://github.com/apache/lucene.git
LUCENE-1479: if date is missing, don't skip the doc; just don't add 'docdate' field to the doc
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@733697 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f0300a643e
commit
03d6b2aa24
|
@ -33,7 +33,6 @@ import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A DocMaker using the (compressed) Trec collection for its input.
|
* A DocMaker using the (compressed) Trec collection for its input.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -44,7 +43,14 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
*/
|
*/
|
||||||
public class TrecDocMaker extends BasicDocMaker {
|
public class TrecDocMaker extends BasicDocMaker {
|
||||||
|
|
||||||
private static final String newline = System.getProperty("line.separator");
|
private static final String DATE = "Date: ";
|
||||||
|
private static final String DOCHDR = "<DOCHDR>";
|
||||||
|
private static final String TERM_DOCHDR = "</DOCHDR>";
|
||||||
|
private static final String TERM_DOCNO = "</DOCNO>";
|
||||||
|
private static final String DOCNO = "<DOCNO>";
|
||||||
|
private static final String TERM_DOC = "</DOC>";
|
||||||
|
private static final String DOC = "<DOC>";
|
||||||
|
private static final String NEW_LINE = System.getProperty("line.separator");
|
||||||
|
|
||||||
protected ThreadLocal dateFormat = new ThreadLocal();
|
protected ThreadLocal dateFormat = new ThreadLocal();
|
||||||
protected File dataDir = null;
|
protected File dataDir = null;
|
||||||
|
@ -135,25 +141,37 @@ public class TrecDocMaker extends BasicDocMaker {
|
||||||
}
|
}
|
||||||
|
|
||||||
// read until finding a line that starts with the specified prefix
|
// read until finding a line that starts with the specified prefix
|
||||||
protected StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception {
|
protected StringBuffer read(String prefix, StringBuffer sb,
|
||||||
|
boolean collectMatchLine, boolean collectAll,
|
||||||
|
String terminatingTag) throws Exception {
|
||||||
sb = (sb==null ? new StringBuffer() : sb);
|
sb = (sb==null ? new StringBuffer() : sb);
|
||||||
String sep = "";
|
String sep = "";
|
||||||
while (true) {
|
while (true) {
|
||||||
String line = reader.readLine();
|
String line = reader.readLine();
|
||||||
if (line==null) {
|
if (line == null) {
|
||||||
openNextFile();
|
openNextFile();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (line.startsWith(prefix)) {
|
if (line.startsWith(prefix)) {
|
||||||
if (collectMatchLine) {
|
if (collectMatchLine) {
|
||||||
sb.append(sep+line);
|
sb.append(sep).append(line);
|
||||||
sep = newline;
|
sep = NEW_LINE;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (terminatingTag != null && line.startsWith(terminatingTag)) {
|
||||||
|
// didn't find the prefix that was asked, but the terminating
|
||||||
|
// tag was found. set the length to 0 to signal no match was
|
||||||
|
// found.
|
||||||
|
sb.setLength(0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if (collectAll) {
|
if (collectAll) {
|
||||||
sb.append(sep+line);
|
sb.append(sep).append(line);
|
||||||
sep = newline;
|
sep = NEW_LINE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//System.out.println("read: "+sb);
|
//System.out.println("read: "+sb);
|
||||||
|
@ -165,22 +183,31 @@ public class TrecDocMaker extends BasicDocMaker {
|
||||||
openNextFile();
|
openNextFile();
|
||||||
}
|
}
|
||||||
// 1. skip until doc start
|
// 1. skip until doc start
|
||||||
read("<DOC>",null,false,false);
|
read(DOC,null,false,false,null);
|
||||||
// 2. name
|
// 2. name
|
||||||
StringBuffer sb = read("<DOCNO>",null,true,false);
|
StringBuffer sb = read(DOCNO,null,true,false,null);
|
||||||
String name = sb.substring("<DOCNO>".length());
|
String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length()));
|
||||||
name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration;
|
name = name + "_" + iteration;
|
||||||
// 3. skip until doc header
|
// 3. skip until doc header
|
||||||
read("<DOCHDR>",null,false,false);
|
read(DOCHDR,null,false,false,null);
|
||||||
|
boolean findTerminatingDocHdr = false;
|
||||||
// 4. date
|
// 4. date
|
||||||
sb = read("Date: ",null,true,false);
|
sb = read(DATE,null,true,false,TERM_DOCHDR);
|
||||||
String dateStr = sb.substring("Date: ".length());
|
String dateStr = null;
|
||||||
|
if (sb.length() != 0) {
|
||||||
|
// Date found.
|
||||||
|
dateStr = sb.substring(DATE.length());
|
||||||
|
findTerminatingDocHdr = true;
|
||||||
|
}
|
||||||
|
|
||||||
// 5. skip until end of doc header
|
// 5. skip until end of doc header
|
||||||
read("</DOCHDR>",null,false,false);
|
if (findTerminatingDocHdr) {
|
||||||
|
read(TERM_DOCHDR,null,false,false,null);
|
||||||
|
}
|
||||||
// 6. collect until end of doc
|
// 6. collect until end of doc
|
||||||
sb = read("</DOC>",null,false,true);
|
sb = read(TERM_DOC,null,false,true,null);
|
||||||
// this is the next document, so parse it
|
// this is the next document, so parse it
|
||||||
Date date = parseDate(dateStr);
|
Date date = dateStr != null ? parseDate(dateStr) : null;
|
||||||
HTMLParser p = getHtmlParser();
|
HTMLParser p = getHtmlParser();
|
||||||
DocData docData = p.parse(name, date, sb, getDateFormat(0));
|
DocData docData = p.parse(name, date, sb, getDateFormat(0));
|
||||||
addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
|
addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
|
||||||
|
@ -202,18 +229,14 @@ public class TrecDocMaker extends BasicDocMaker {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Date parseDate(String dateStr) {
|
protected Date parseDate(String dateStr) {
|
||||||
Date date = null;
|
for (int i = 0; i < DATE_FORMATS.length; i++) {
|
||||||
for (int i=0; i<DATE_FORMATS.length; i++) {
|
|
||||||
try {
|
try {
|
||||||
date = getDateFormat(i).parse(dateStr.trim());
|
return getDateFormat(i).parse(dateStr.trim());
|
||||||
return date;
|
} catch (ParseException e) {}
|
||||||
} catch (ParseException e) {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// do not fail test just because a date could not be parsed
|
// do not fail test just because a date could not be parsed
|
||||||
System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
|
System.out.println("ignoring date parse exception (assigning 'null') for: "+dateStr);
|
||||||
date = new Date(); // now
|
return null;
|
||||||
return date;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,321 @@
|
||||||
|
package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.Date;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
public class TrecDocMakerTest extends TestCase {
|
||||||
|
|
||||||
|
/** A TrecDocMaker which works on a String and not files. */
|
||||||
|
private static class StringableTrecDocMaker extends TrecDocMaker {
|
||||||
|
|
||||||
|
private String docs = null;
|
||||||
|
|
||||||
|
public StringableTrecDocMaker(String docs, boolean forever) {
|
||||||
|
this.docs = docs;
|
||||||
|
this.forever = forever;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void openNextFile() throws NoMoreDataException, Exception {
|
||||||
|
if (reader != null) {
|
||||||
|
if (!forever) {
|
||||||
|
throw new NoMoreDataException();
|
||||||
|
}
|
||||||
|
++iteration;
|
||||||
|
}
|
||||||
|
|
||||||
|
reader = new BufferedReader(new StringReader(docs));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertDocData(DocData dd, String expName, String expTitle, String expBody, Date expDate) {
|
||||||
|
assertNotNull(dd);
|
||||||
|
assertEquals(expName, dd.getName());
|
||||||
|
assertEquals(expTitle, dd.getTitle());
|
||||||
|
assertTrue(dd.getBody().indexOf(expBody) != -1);
|
||||||
|
assertEquals(expDate, dd.getDate());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertNoMoreDataException(StringableTrecDocMaker stdm) throws Exception {
|
||||||
|
boolean thrown = false;
|
||||||
|
try {
|
||||||
|
stdm.getNextDocData();
|
||||||
|
} catch (NoMoreDataException e) {
|
||||||
|
thrown = true;
|
||||||
|
}
|
||||||
|
assertTrue("Expecting NoMoreDataException", thrown);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOneDocument() throws Exception {
|
||||||
|
String docs = "<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-000</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-000 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-000 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>";
|
||||||
|
StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
|
||||||
|
stdm.setHTMLParser(new DemoHTMLParser());
|
||||||
|
|
||||||
|
DocData dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
|
||||||
|
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
|
||||||
|
|
||||||
|
assertNoMoreDataException(stdm);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTwoDocuments() throws Exception {
|
||||||
|
String docs = "<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-000</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-000 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-000 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>\r\n" +
|
||||||
|
"<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-001</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-001 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-001 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>";
|
||||||
|
StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
|
||||||
|
stdm.setHTMLParser(new DemoHTMLParser());
|
||||||
|
|
||||||
|
DocData dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
|
||||||
|
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
|
||||||
|
|
||||||
|
dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
|
||||||
|
.parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
|
||||||
|
|
||||||
|
assertNoMoreDataException(stdm);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a Date: attribute is missing, make sure the document is not skipped, but
|
||||||
|
// rather that null Data is assigned.
|
||||||
|
public void testMissingDate() throws Exception {
|
||||||
|
String docs = "<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-000</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-000 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-000 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>\r\n" +
|
||||||
|
"<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-001</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-001 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-001 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>";
|
||||||
|
StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
|
||||||
|
stdm.setHTMLParser(new DemoHTMLParser());
|
||||||
|
|
||||||
|
DocData dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
|
||||||
|
|
||||||
|
dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
|
||||||
|
.parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
|
||||||
|
|
||||||
|
assertNoMoreDataException(stdm);
|
||||||
|
}
|
||||||
|
|
||||||
|
// When a 'bad date' is input (unparsable date), make sure the DocData date is
|
||||||
|
// assigned null.
|
||||||
|
public void testBadDate() throws Exception {
|
||||||
|
String docs = "<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-000</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Date: Bad Date\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-000 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-000 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>";
|
||||||
|
StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
|
||||||
|
stdm.setHTMLParser(new DemoHTMLParser());
|
||||||
|
|
||||||
|
DocData dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
|
||||||
|
|
||||||
|
assertNoMoreDataException(stdm);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testForever() throws Exception {
|
||||||
|
String docs = "<DOC>\r\n" +
|
||||||
|
"<DOCNO>TEST-000</DOCNO>\r\n" +
|
||||||
|
"<DOCHDR>\r\n" +
|
||||||
|
"http://lucene.apache.org.trecdocmaker.test\r\n" +
|
||||||
|
"HTTP/1.1 200 OK\r\n" +
|
||||||
|
"Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Server: Apache/1.3.27 (Unix)\r\n" +
|
||||||
|
"Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
|
||||||
|
"Content-Length: 614\r\n" +
|
||||||
|
"Connection: close\r\n" +
|
||||||
|
"Content-Type: text/html\r\n" +
|
||||||
|
"</DOCHDR>\r\n" +
|
||||||
|
"<html>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<head>\r\n" +
|
||||||
|
"<title>\r\n" +
|
||||||
|
"TEST-000 title\r\n" +
|
||||||
|
"</title>\r\n" +
|
||||||
|
"</head>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"<body>\r\n" +
|
||||||
|
"TEST-000 text\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</body>\r\n" +
|
||||||
|
"\r\n" +
|
||||||
|
"</DOC>";
|
||||||
|
StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true);
|
||||||
|
stdm.setHTMLParser(new DemoHTMLParser());
|
||||||
|
|
||||||
|
DocData dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
|
||||||
|
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
|
||||||
|
|
||||||
|
// same document, but the second iteration changes the name.
|
||||||
|
dd = stdm.getNextDocData();
|
||||||
|
assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm
|
||||||
|
.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
|
||||||
|
|
||||||
|
// Don't test that NoMoreDataException is thrown, since the forever flag is
|
||||||
|
// turned on.
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue