mirror of https://github.com/apache/lucene.git
LUCENE-2059: allow TrecContentSource not to change the docname
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@835677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
149a0c9b25
commit
731f418448
|
@ -4,6 +4,13 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
||||||
|
|
||||||
$Id:$
|
$Id:$
|
||||||
|
|
||||||
|
11/12/2009
|
||||||
|
LUCENE-2059: allow TrecContentSource not to change the docname.
|
||||||
|
Previously, it would always append the iteration # to the docname.
|
||||||
|
Setting the new option content.source.excludeIteration to true disables this.
|
||||||
|
The resulting index can then be used with the quality package to measure
|
||||||
|
relevance. (Robert Muir)
|
||||||
|
|
||||||
11/12/2009
|
11/12/2009
|
||||||
LUCENE-2058: specify trec_eval submission output from the command line.
|
LUCENE-2058: specify trec_eval submission output from the command line.
|
||||||
Previously, 4 arguments were required, but the third was unused. The
|
Previously, 4 arguments were required, but the third was unused. The
|
||||||
|
|
|
@ -48,6 +48,7 @@ import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
|
||||||
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
|
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
|
||||||
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
|
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
|
||||||
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
|
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
|
||||||
|
* <li><b>content.source.excludeIteration</b> - if true, do not append the iteration number to the docname (<b>default=false</b>).
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public class TrecContentSource extends ContentSource {
|
public class TrecContentSource extends ContentSource {
|
||||||
|
@ -91,6 +92,7 @@ public class TrecContentSource extends ContentSource {
|
||||||
BufferedReader reader;
|
BufferedReader reader;
|
||||||
int iteration = 0;
|
int iteration = 0;
|
||||||
HTMLParser htmlParser;
|
HTMLParser htmlParser;
|
||||||
|
private boolean excludeDocnameIteration;
|
||||||
|
|
||||||
private DateFormatInfo getDateFormatInfo() {
|
private DateFormatInfo getDateFormatInfo() {
|
||||||
DateFormatInfo dfi = dateFormats.get();
|
DateFormatInfo dfi = dateFormats.get();
|
||||||
|
@ -256,6 +258,7 @@ public class TrecContentSource extends ContentSource {
|
||||||
read(docBuf, DOCNO, true, false, null);
|
read(docBuf, DOCNO, true, false, null);
|
||||||
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
|
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
|
||||||
DOCNO.length()));
|
DOCNO.length()));
|
||||||
|
if (!excludeDocnameIteration)
|
||||||
name = name + "_" + iteration;
|
name = name + "_" + iteration;
|
||||||
|
|
||||||
// 3. skip until doc header
|
// 3. skip until doc header
|
||||||
|
@ -342,6 +345,7 @@ public class TrecContentSource extends ContentSource {
|
||||||
if (encoding == null) {
|
if (encoding == null) {
|
||||||
encoding = "ISO-8859-1";
|
encoding = "ISO-8859-1";
|
||||||
}
|
}
|
||||||
|
excludeDocnameIteration = config.get("content.source.excludeIteration", false);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue