LUCENE-2210: fix TrecTopicsReader for descriptions and narratives

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@899369 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-01-14 19:33:48 +00:00
parent f8bd9819e6
commit 43c11321b4
4 changed files with 80 additions and 12 deletions

View File

@ -4,6 +4,10 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
1/14/2010
LUCENE-2210: TrecTopicsReader now properly reads descriptions and
narratives from trec topics files. (Robert Muir)
1/11/2010
LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
which sets a Locale in the run data for collation to use, and can be

View File

@ -76,16 +76,31 @@ public class TrecTopicsReader {
k = sb.indexOf(">");
String title = sb.substring(k+1).trim();
// description
sb = read(reader,"<desc>",null,false,false);
sb = read(reader,"<narr>",null,false,true);
String descripion = sb.toString().trim();
read(reader,"<desc>",null,false,false);
sb.setLength(0);
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith("<narr>"))
break;
if (sb.length() > 0) sb.append(' ');
sb.append(line);
}
String description = sb.toString().trim();
// narrative
sb.setLength(0);
while ((line = reader.readLine()) != null) {
if (line.startsWith("</top>"))
break;
if (sb.length() > 0) sb.append(' ');
sb.append(line);
}
String narrative = sb.toString().trim();
// we got a topic!
fields.put("title",title);
fields.put("description",descripion);
fields.put("description",description);
fields.put("narrative", narrative);
QualityQuery topic = new QualityQuery(id,fields);
res.add(topic);
// skip narrative, get to end of doc
read(reader,"</top>",null,false,false);
}
} finally {
reader.close();

View File

@ -149,6 +149,49 @@ public class TestQualityRun extends TestCase {
}
public void testTrecTopicsReader() throws Exception {
File workDir = new File(System.getProperty("benchmark.work.dir","work"));
assertTrue("Bad workDir: " + workDir,
workDir.exists() && workDir.isDirectory());
// <tests src dir> for topics/qrels files:
// src/test/org/apache/lucene/benchmark/quality
File srcTestDir = new File(new File(new File(new File(new File(
new File(new File(workDir.getAbsoluteFile().getParentFile(),
"src"),"test"),"org"),"apache"),"lucene"),"benchmark"),"quality");
// prepare topics
File topicsFile = new File(srcTestDir, "trecTopics.txt");
assertTrue("Bad topicsFile: " + topicsFile,
topicsFile.exists() && topicsFile.isFile());
TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(
new BufferedReader(new FileReader(topicsFile)));
assertEquals(20, qqs.length);
QualityQuery qq = qqs[0];
assertEquals("statement months total 1987", qq.getValue("title"));
assertEquals("Topic 0 Description Line 1 Topic 0 Description Line 2",
qq.getValue("description"));
assertEquals("Topic 0 Narrative Line 1 Topic 0 Narrative Line 2",
qq.getValue("narrative"));
qq = qqs[1];
assertEquals("agreed 15 against five", qq.getValue("title"));
assertEquals("Topic 1 Description Line 1 Topic 1 Description Line 2",
qq.getValue("description"));
assertEquals("Topic 1 Narrative Line 1 Topic 1 Narrative Line 2",
qq.getValue("narrative"));
qq = qqs[19];
assertEquals("20 while common week", qq.getValue("title"));
assertEquals("Topic 19 Description Line 1 Topic 19 Description Line 2",
qq.getValue("description"));
assertEquals("Topic 19 Narrative Line 1 Topic 19 Narrative Line 2",
qq.getValue("narrative"));
}
// use benchmark logic to create the full Reuters index
private void createReutersIndex() throws Exception {

View File

@ -26,10 +26,12 @@
<title> statement months total 1987
<desc> Description:
Topic 0 Description Line 1
Topic 0 Description Line 2
<narr> Narrative:
Topic 0 Narrative Line 1
Topic 0 Narrative Line 2
</top>
@ -39,10 +41,12 @@
<title> agreed 15 against five
<desc> Description:
Topic 1 Description Line 1
Topic 1 Description Line 2
<narr> Narrative:
Topic 1 Narrative Line 1
Topic 1 Narrative Line 2
</top>
@ -273,9 +277,11 @@
<title> 20 while common week
<desc> Description:
Topic 19 Description Line 1
Topic 19 Description Line 2
<narr> Narrative:
Topic 19 Narrative Line 1
Topic 19 Narrative Line 2
</top>