LUCENE-2059: allow TrecContentSource not to change the docname

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@835677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-11-13 00:47:15 +00:00
parent 149a0c9b25
commit 731f418448
2 changed files with 12 additions and 1 deletions

View File

@ -4,6 +4,13 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$ $Id:$
11/12/2009
LUCENE-2059: allow TrecContentSource not to change the docname.
Previously, it would always append the iteration # to the docname.
Setting the new option content.source.excludeIteration to true disables this
(the default, false, keeps the previous behavior).
The resulting index can then be used with the quality package to measure
relevance. (Robert Muir)
11/12/2009 11/12/2009
LUCENE-2058: specify trec_eval submission output from the command line. LUCENE-2058: specify trec_eval submission output from the command line.
Previously, 4 arguments were required, but the third was unused. The Previously, 4 arguments were required, but the third was unused. The

View File

@ -48,6 +48,7 @@ import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>). * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used. * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
* <li><b>content.source.excludeIteration</b> - if true, do not append the iteration number to the docname (<b>default=false</b>).
* </ul> * </ul>
*/ */
public class TrecContentSource extends ContentSource { public class TrecContentSource extends ContentSource {
@ -91,6 +92,7 @@ public class TrecContentSource extends ContentSource {
BufferedReader reader; BufferedReader reader;
int iteration = 0; int iteration = 0;
HTMLParser htmlParser; HTMLParser htmlParser;
private boolean excludeDocnameIteration;
private DateFormatInfo getDateFormatInfo() { private DateFormatInfo getDateFormatInfo() {
DateFormatInfo dfi = dateFormats.get(); DateFormatInfo dfi = dateFormats.get();
@ -256,6 +258,7 @@ public class TrecContentSource extends ContentSource {
read(docBuf, DOCNO, true, false, null); read(docBuf, DOCNO, true, false, null);
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO, name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
DOCNO.length())); DOCNO.length()));
if (!excludeDocnameIteration)
name = name + "_" + iteration; name = name + "_" + iteration;
// 3. skip until doc header // 3. skip until doc header
@ -342,6 +345,7 @@ public class TrecContentSource extends ContentSource {
if (encoding == null) { if (encoding == null) {
encoding = "ISO-8859-1"; encoding = "ISO-8859-1";
} }
excludeDocnameIteration = config.get("content.source.excludeIteration", false);
} }
} }