mirror of https://github.com/apache/lucene.git
LUCENE-2059: allow TrecContentSource not to change the docname
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@835677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
149a0c9b25
commit
731f418448
|
@ -4,6 +4,13 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
11/12/2009
|
||||
LUCENE-2059: allow TrecContentSource not to change the docname.
|
||||
Previously, it would always append the iteration # to the docname.
|
||||
With the new option content.source.excludeIteration, you can disable this.
|
||||
The resulting index can then be used with the quality package to measure
|
||||
relevance. (Robert Muir)
|
||||
|
||||
11/12/2009
|
||||
LUCENE-2058: specify trec_eval submission output from the command line.
|
||||
Previously, 4 arguments were required, but the third was unused. The
|
||||
|
|
|
@ -48,6 +48,7 @@ import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
|
|||
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
|
||||
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
|
||||
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
|
||||
* <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname
|
||||
* </ul>
|
||||
*/
|
||||
public class TrecContentSource extends ContentSource {
|
||||
|
@ -91,6 +92,7 @@ public class TrecContentSource extends ContentSource {
|
|||
BufferedReader reader;
|
||||
int iteration = 0;
|
||||
HTMLParser htmlParser;
|
||||
private boolean excludeDocnameIteration;
|
||||
|
||||
private DateFormatInfo getDateFormatInfo() {
|
||||
DateFormatInfo dfi = dateFormats.get();
|
||||
|
@ -256,7 +258,8 @@ public class TrecContentSource extends ContentSource {
|
|||
read(docBuf, DOCNO, true, false, null);
|
||||
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
|
||||
DOCNO.length()));
|
||||
name = name + "_" + iteration;
|
||||
if (!excludeDocnameIteration)
|
||||
name = name + "_" + iteration;
|
||||
|
||||
// 3. skip until doc header
|
||||
docBuf.setLength(0);
|
||||
|
@ -342,6 +345,7 @@ public class TrecContentSource extends ContentSource {
|
|||
if (encoding == null) {
|
||||
encoding = "ISO-8859-1";
|
||||
}
|
||||
excludeDocnameIteration = config.get("content.source.excludeIteration", false);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue