LUCENE-836: Add support for search quality benchmarking.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@560372 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doron Cohen 2007-07-27 20:24:52 +00:00
parent c1496444b2
commit 98fa2d898d
20 changed files with 2469 additions and 6 deletions

View File

@ -284,6 +284,8 @@
</copy>
</target>
<property name="tests.verbose" value="false"/>
<target name="test" depends="compile-test" description="Runs unit tests">
<fail unless="junit.present">
##################################################################
@ -299,6 +301,10 @@
<assertions>
<enable package="org.apache.lucene"/>
</assertions>
<!-- allow tests to control debug prints -->
<sysproperty key="tests.verbose" value="${tests.verbose}"/>
<!-- TODO: create propertyset for test properties, so each project can have its own set -->
<sysproperty key="dataDir" file="src/test"/>
<sysproperty key="tempDir" file="${build.dir}/test"/>

View File

@ -4,6 +4,14 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
7/27/07
LUCENE-836: Add support for search quality benchmarking: run
a set of queries against a searcher, optionally produce a submission
report, and, if query judgements are available, compute quality measures:
recall, precision_at_N, average_precision, MAP. A TREC-specific Judge (based
on TREC QRels) and a TREC Topics reader are included in o.a.l.benchmark.quality.trec,
but any other format of queries and judgements can be implemented and used.
7/24/07
LUCENE-947: Add support for creating and index "one document per
line" from a large text file, which reduces per-document overhead of

View File

@ -0,0 +1,53 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality;
import java.io.PrintWriter;
/**
 * Decides whether a document is relevant for a quality query.
 */
public interface Judge {

  /**
   * Tell whether the document named <code>docName</code> is relevant for the given quality query.
   * @param docName name of the document being tested for relevancy.
   * @param query quality query the document is tested against.
   * @return true if the document is relevant, false otherwise.
   */
  boolean isRelevant(String docName, QualityQuery query);

  /**
   * Check that the given queries and the data of this Judge match each other.
   * A perfect match means that this Judge has some data for each and every
   * input quality query, and has no data for any other quality query.
   * <b>Note</b>: an imperfect match does not make the quality benchmark run
   * fail; only a warning message would be logged.
   * @param qq quality queries to be validated.
   * @param logger validation issues are logged here, unless it is null.
   * @return true if perfectly valid, false if not.
   */
  boolean validateData(QualityQuery[] qq, PrintWriter logger);

  /**
   * Return the maximal recall for the input quality query, that is,
   * the number of relevant docs this Judge "knows" for the query.
   * @param query the query whose maximal recall is needed.
   * @return number of relevant docs known to this Judge for <code>query</code>.
   */
  int maxRecall(QualityQuery query);
}

View File

@ -0,0 +1,124 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality;
import java.io.IOException;
import java.io.PrintWriter;
import org.apache.lucene.benchmark.quality.utils.DocNameExtractor;
import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
/**
 * Main entry point for running a quality benchmark.
 * <p>
 * There are two main configurations for running a quality benchmark: <ul>
 * <li>Against existing judgements.</li>
 * <li>For submission (e.g. for a contest).</li>
 * </ul>
 * The first configuration requires a non null
 * {@link org.apache.lucene.benchmark.quality.Judge Judge}.
 * The second configuration requires a non null
 * {@link org.apache.lucene.benchmark.quality.utils.SubmissionReport SubmissionReport}.
 */
public class QualityBenchmark {

  /** Quality Queries that this quality benchmark would execute. */
  protected QualityQuery qualityQueries[];

  /** Parser for turning QualityQueries into Lucene Queries. */
  protected QualityQueryParser qqParser;

  /** Index to be searched. */
  protected Searcher searcher;

  /** index field to extract doc name for each search result; used for judging the results. */
  protected String docNameField;

  /**
   * Create a QualityBenchmark.
   * @param qqs quality queries to run.
   * @param qqParser parser for turning QualityQueries into Lucene Queries.
   * @param searcher index to be searched.
   * @param docNameField name of field containing the document name.
   *        This allows to extract the doc name for search results,
   *        and is important for judging the results.
   */
  public QualityBenchmark(QualityQuery qqs[], QualityQueryParser qqParser,
      Searcher searcher, String docNameField) {
    this.qualityQueries = qqs;
    this.qqParser = qqParser;
    this.searcher = searcher;
    this.docNameField = docNameField;
  }

  /**
   * Run the quality benchmark.
   * @param maxResults how many results to collect for each quality query.
   * @param judge the judge that can tell if a certain result doc is relevant for a certain quality query.
   *        If null, no judgements would be made. Usually null for a submission run.
   * @param submitRep submission report is created if non null.
   * @param qualityLog If not null, quality run data would be printed for each query.
   * @return QualityStats of each quality query that was executed, in the order of the
   *         quality queries. Entries are only filled in when <code>judge</code> is not null;
   *         without a judge every entry of the returned array is null.
   * @throws Exception if quality benchmark failed to run.
   */
  public QualityStats [] execute(int maxResults, Judge judge, SubmissionReport submitRep,
      PrintWriter qualityLog) throws Exception {
    QualityStats stats[] = new QualityStats[qualityQueries.length];
    for (int i=0; i<qualityQueries.length; i++) {
      QualityQuery qq = qualityQueries[i];
      // generate query
      Query q = qqParser.parse(qq);
      // search with this query, timing only the search call itself
      long t1 = System.currentTimeMillis();
      TopDocs td = searcher.search(q,null,maxResults);
      long searchTime = System.currentTimeMillis()-t1;
      // most likely we either submit or judge, but check both
      if (judge!=null) {
        stats[i] = analyzeQueryResults(qq, q, td, judge, qualityLog, searchTime);
      }
      if (submitRep!=null) {
        submitRep.report(qq,td,docNameField,searcher);
      }
    }
    return stats;
  }

  /*
   * Analyze/judge results for a single quality query; optionally log them.
   * Each result is fed to the stats together with the time it took to extract its doc name.
   */
  private QualityStats analyzeQueryResults(QualityQuery qq, Query q, TopDocs td, Judge judge, PrintWriter logger, long searchTime) throws IOException {
    QualityStats stts = new QualityStats(judge.maxRecall(qq),searchTime);
    ScoreDoc sd[] = td.scoreDocs;
    // for the first doc name we also measure construction of the doc name extractor, just in case.
    long t1 = System.currentTimeMillis();
    DocNameExtractor xt = new DocNameExtractor(docNameField);
    for (int i=0; i<sd.length; i++) {
      String docName = xt.docName(searcher,sd[i].doc);
      long docNameExtractTime = System.currentTimeMillis() - t1;
      boolean isRelevant = judge.isRelevant(docName,qq);
      stts.addResult(i+1,isRelevant, docNameExtractTime);
      // restart the timer only after judging, so that the time spent in the judge
      // and in the stats is not accounted as doc name extraction time of the next doc.
      t1 = System.currentTimeMillis();
    }
    if (logger!=null) {
      logger.println(qq.getQueryID()+" - "+q);
      stts.log(qq.getQueryID()+" Stats:",1,logger," ");
    }
    return stts;
  }
}

View File

@ -0,0 +1,87 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality;
import java.util.Map;
/**
 * A QualityQuery has an ID and some name-value pairs.
 * <p>
 * The ID allows to map the quality query with its judgements.
 * <p>
 * The name-value pairs are used by a
 * {@link org.apache.lucene.benchmark.quality.QualityQueryParser}
 * to create a Lucene {@link org.apache.lucene.search.Query}.
 * <p>
 * It is very likely that name-value-pairs would be mapped into fields in a Lucene query,
 * but it is up to the QualityQueryParser how to map - e.g. all values in a single field,
 * or each pair as its own field, etc., - and this of course must match the way the
 * searched index was constructed.
 */
public class QualityQuery implements Comparable {

  private String queryID;
  private Map nameValPairs;

  /**
   * Create a QualityQuery with given ID and name-value pairs.
   * @param queryID ID of this quality query.
   * @param nameValPairs the contents of this quality query.
   */
  public QualityQuery(String queryID, Map nameValPairs) {
    this.queryID = queryID;
    this.nameValPairs = nameValPairs;
  }

  /**
   * Return all the names of name-value-pairs in this QualityQuery.
   */
  public String[] getNames() {
    return (String[]) nameValPairs.keySet().toArray(new String[0]);
  }

  /**
   * Return the value of a certain name-value pair.
   * @param name the name whose value should be returned.
   */
  public String getValue(String name) {
    return (String) nameValPairs.get(name);
  }

  /**
   * Return the ID of this query.
   * The ID allows to map the quality query with its judgements.
   */
  public String getQueryID() {
    return queryID;
  }

  /**
   * For a nicer sort of input queries before running them.
   * When both IDs parse as ints they are compared numerically,
   * otherwise the IDs are compared as strings.
   * @param o the other QualityQuery.
   * @return negative, zero or positive, as this query sorts before, same as, or after the other.
   */
  public int compareTo(Object o) {
    QualityQuery other = (QualityQuery) o;
    try {
      // compare as ints when ids ints
      int n = Integer.parseInt(queryID);
      int nOther = Integer.parseInt(other.queryID);
      // explicit comparison rather than subtraction: n - nOther can overflow
      // and then report the wrong sign for ids that are far apart.
      return n < nOther ? -1 : (n == nOther ? 0 : 1);
    } catch (NumberFormatException e) {
      // fall back to string comparison
      return queryID.compareTo(other.queryID);
    }
  }
}

View File

@ -0,0 +1,34 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
/**
 * Turns a QualityQuery into a Lucene query.
 */
public interface QualityQueryParser {

  /**
   * Create a Lucene query out of the given QualityQuery.
   * @param qq the quality query to be parsed.
   * @return the Lucene query created for <code>qq</code>.
   * @throws ParseException if parsing failed.
   */
  Query parse(QualityQuery qq) throws ParseException;
}

View File

@ -0,0 +1,266 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality;
import java.io.PrintWriter;
import java.text.NumberFormat;
import java.util.ArrayList;
/**
 * Results of quality benchmark run for a single query or for a set of queries.
 */
public class QualityStats {

  /** Number of points for which precision is computed. */
  public static final int MAX_POINTS = 20;

  /** Minimal width, in chars, of the integer part of a logged number. */
  private static final int MIN_INT_PART_WIDTH = 6;

  private double maxGoodPoints;
  private double recall;
  private double pAt[];
  private double pReleventSum = 0;
  private double numPoints = 0;
  private double numGoodPoints = 0;
  private long searchTime;
  private long docNamesExtractTime;

  /**
   * A certain rank in which a relevant doc was found.
   */
  public static class RecallPoint {
    private int rank;
    private double recall;
    private RecallPoint(int rank, double recall) {
      this.rank = rank;
      this.recall = recall;
    }
    /** Returns the rank: where on the list of returned docs this relevant doc appeared. */
    public int getRank() {
      return rank;
    }
    /** Returns the recall: how many relevant docs were returned up to this point, inclusive. */
    public double getRecall() {
      return recall;
    }
  }

  private ArrayList recallPoints;

  /**
   * Construct a QualityStats object with anticipated maximal number of relevant hits.
   * @param maxGoodPoints maximal possible relevant hits.
   * @param searchTime time the search took, in milliseconds.
   */
  public QualityStats(double maxGoodPoints, long searchTime) {
    this.maxGoodPoints = maxGoodPoints;
    this.searchTime = searchTime;
    this.recallPoints = new ArrayList();
    pAt = new double[MAX_POINTS+1]; // pAt[0] unused.
  }

  /**
   * Add a (possibly relevant) doc.
   * Docs must be added in rank order: 1, 2, 3, ...
   * @param n rank of the added doc (its ordinal position within the query results).
   * @param isRelevant true if the added doc is relevant, false otherwise.
   * @param docNameExtractTime time it took to extract the name of the added doc, in milliseconds;
   *        accumulated into {@link #getDocNamesExtractTime()}.
   * @throws IllegalArgumentException if <code>n</code> is not the rank following the last added one.
   */
  public void addResult(int n, boolean isRelevant, long docNameExtractTime) {
    if (Math.abs(numPoints+1 - n) > 1E-6) {
      throw new IllegalArgumentException("point "+n+" illegal after "+numPoints+" points!");
    }
    if (isRelevant) {
      numGoodPoints+=1;
      recallPoints.add(new RecallPoint(n,numGoodPoints));
    }
    numPoints = n;
    double p = numGoodPoints / numPoints;
    if (isRelevant) {
      pReleventSum += p;
    }
    if (n<pAt.length) {
      pAt[n] = p;
    }
    // when the maximal number of relevant docs is not known, the current precision is used as recall
    recall = maxGoodPoints<=0 ? p : numGoodPoints/maxGoodPoints;
    docNamesExtractTime += docNameExtractTime;
  }

  /**
   * Return the precision at rank n:
   * |{relevant hits within first <code>n</code> hits}| / <code>n</code>.
   * @param n requested precision point, must be at least 1 and at most {@link #MAX_POINTS}.
   * @throws IllegalArgumentException if <code>n</code> is out of that range.
   */
  public double getPrecisionAt(int n) {
    if (n<1 || n>MAX_POINTS) {
      throw new IllegalArgumentException("n="+n+" - but it must be in [1,"+MAX_POINTS+"] range!");
    }
    if (n>numPoints) {
      // fewer than n results: the relevant hits found so far, divided by the requested n
      return (numPoints * pAt[(int)numPoints])/n;
    }
    return pAt[n];
  }

  /**
   * Return the average precision at recall points: sum of precision at recall points / maxGoodPoints.
   * Returns 0 when maxGoodPoints is 0.
   */
  public double getAvp() {
    return maxGoodPoints==0 ? 0 : pReleventSum/maxGoodPoints;
  }

  /**
   * Return the recall: |{relevant hits found}| / |{maximal possible relevant hits}|.
   * When the maximal number of relevant hits is not positive, the precision
   * at the last added point is returned instead.
   */
  public double getRecall() {
    return recall;
  }

  /**
   * Log information on this QualityStats object.
   * Numbers are always printed with '.' as the decimal separator,
   * regardless of the default locale.
   * @param title printed before the stats, unless null or blank.
   * @param paddLines number of empty lines printed before and after the stats.
   * @param logger Logger.
   * @param prefix prefix before each log line; null is the same as empty.
   */
  public void log(String title, int paddLines, PrintWriter logger, String prefix) {
    for (int i=0; i<paddLines; i++) {
      logger.println();
    }
    if (title!=null && title.trim().length()>0) {
      logger.println(title);
    }
    prefix = prefix==null ? "" : prefix;
    // fixed locale: fracFormat() aligns numbers on the '.' decimal separator
    NumberFormat nf = NumberFormat.getInstance(java.util.Locale.US);
    nf.setMaximumFractionDigits(3);
    nf.setMinimumFractionDigits(3);
    nf.setGroupingUsed(true);
    int M = 19;
    logger.println(prefix+format("Search Seconds: ",M)+
        fracFormat(nf.format((double)searchTime/1000)));
    logger.println(prefix+format("DocName Seconds: ",M)+
        fracFormat(nf.format((double)docNamesExtractTime/1000)));
    logger.println(prefix+format("Num Points: ",M)+
        fracFormat(nf.format(numPoints)));
    logger.println(prefix+format("Num Good Points: ",M)+
        fracFormat(nf.format(numGoodPoints)));
    logger.println(prefix+format("Max Good Points: ",M)+
        fracFormat(nf.format(maxGoodPoints)));
    logger.println(prefix+format("Average Precision: ",M)+
        fracFormat(nf.format(getAvp())));
    logger.println(prefix+format("Recall: ",M)+
        fracFormat(nf.format(getRecall())));
    // NOTE(review): the bound on numPoints is exclusive, so the precision at the
    // last collected rank is not printed - confirm whether this is intended.
    for (int i=1; i<(int)numPoints && i<pAt.length; i++) {
      logger.println(prefix+format("Precision At "+i+": ",M)+
          fracFormat(nf.format(getPrecisionAt(i))));
    }
    for (int i=0; i<paddLines; i++) {
      logger.println();
    }
  }

  /* Pad s with trailing spaces to at least minLen chars; null is the same as empty. */
  private static String format(String s, int minLen) {
    StringBuffer sb = new StringBuffer(s==null ? "" : s);
    while (sb.length()<minLen) {
      sb.append(' ');
    }
    return sb.toString();
  }

  /*
   * Pad a formatted number with leading spaces so that its integer part takes
   * at least MIN_INT_PART_WIDTH chars, which aligns the decimal points of logged numbers.
   */
  private static String fracFormat(String frac) {
    int k = frac.indexOf('.');
    if (k<0) {
      k = frac.length(); // no decimal point (e.g. NaN): all of it is the "integer part"
    }
    StringBuffer sb = new StringBuffer();
    for (int i=k; i<MIN_INT_PART_WIDTH; i++) {
      sb.append(' ');
    }
    return sb.append(frac).toString();
  }

  /**
   * Create a QualityStats object that is the average of the input QualityStats objects.
   * Times are averaged over all input stats; all other measures are averaged only
   * over stats whose maximal number of good points is positive.
   * @param stats array of input stats to be averaged.
   * @return an average over the input stats.
   */
  public static QualityStats average(QualityStats[] stats) {
    QualityStats avg = new QualityStats(0,0);
    int m = 0; // queries with positive judgements
    // aggregate
    for (int i=0; i<stats.length; i++) {
      avg.searchTime += stats[i].searchTime;
      avg.docNamesExtractTime += stats[i].docNamesExtractTime;
      if (stats[i].maxGoodPoints>0) {
        m++;
        avg.numGoodPoints += stats[i].numGoodPoints;
        avg.numPoints += stats[i].numPoints;
        avg.pReleventSum += stats[i].getAvp();
        avg.recall += stats[i].recall;
        avg.maxGoodPoints += stats[i].maxGoodPoints;
        for (int j=1; j<avg.pAt.length; j++) {
          avg.pAt[j] += stats[i].getPrecisionAt(j);
        }
      }
    }
    assert m>0 : "Fishy: no \"good\" queries!";
    // take average: times go by all queries, other measures go by "good" queries only.
    avg.searchTime /= stats.length;
    avg.docNamesExtractTime /= stats.length;
    avg.numGoodPoints /= m;
    avg.numPoints /= m;
    avg.recall /= m;
    avg.maxGoodPoints /= m;
    for (int j=1; j<avg.pAt.length; j++) {
      avg.pAt[j] /= m;
    }
    avg.pReleventSum /= m; // this is actually avgp now
    avg.pReleventSum *= avg.maxGoodPoints; // so that getAvp() would be correct

    return avg;
  }

  /**
   * Returns the time it took to extract doc names for judging the measured query, in milliseconds.
   */
  public long getDocNamesExtractTime() {
    return docNamesExtractTime;
  }

  /**
   * Returns the maximal number of good points.
   * This is the number of relevant docs known by the judge for the measured query.
   */
  public double getMaxGoodPoints() {
    return maxGoodPoints;
  }

  /**
   * Returns the number of good points (only relevant points).
   */
  public double getNumGoodPoints() {
    return numGoodPoints;
  }

  /**
   * Returns the number of points (both relevant and irrelevant points).
   */
  public double getNumPoints() {
    return numPoints;
  }

  /**
   * Returns the recallPoints.
   */
  public RecallPoint [] getRecallPoints() {
    return (RecallPoint[]) recallPoints.toArray(new RecallPoint[0]);
  }

  /**
   * Returns the search time in milliseconds for the measured query.
   */
  public long getSearchTime() {
    return searchTime;
  }
}

View File

@ -0,0 +1,65 @@
<html>
<body>
<h2>Search Quality Benchmarking.</h2>
<p>
This package lets you benchmark the search quality of a Lucene application.
<p>
In order to use this package you should provide:
<ul>
<li>A <a href="../../search/Searcher.html">searcher</a>.</li>
<li><a href="QualityQuery.html">Quality queries</a>.</li>
<li><a href="Judge.html">Judging object</a>.</li>
<li><a href="utils/SubmissionReport.html">Reporting object</a>.</li>
</ul>
<p>
For benchmarking TREC collections with TREC QRels, take a look at the
<a href="trec/package-summary.html">trec package</a>.
<p>
Here is sample code for running the TREC 2006 queries 701-850 on the .Gov2 collection:
<pre>
File topicsFile = new File("topics-701-850.txt");
File qrelsFile = new File("qrels-701-850.txt");
Searcher searcher = new IndexSearcher("index");
int maxResults = 1000;
String docNameField = "docname";
PrintWriter logger = new PrintWriter(System.out,true);
// use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile)));
// prepare judge, with trec utilities that read from a QRels file
Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));
// validate that topics &amp; judgments match each other
judge.validateData(qqs, logger);
// set the parsing of quality queries into Lucene queries.
QualityQueryParser qqParser = new SimpleQQParser("title", "body");
// run the benchmark
QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
SubmissionReport submitLog = null;
QualityStats stats[] = qrun.execute(maxResults, judge, submitLog, logger);
// print an average summary of the results
QualityStats avg = QualityStats.average(stats);
avg.log("SUMMARY",2,logger, " ");
</pre>
<p>
Some immediate ways to modify this program to your needs are:
<ul>
<li>To run on different formats of queries and judgements provide your own
<a href="Judge.html">Judge</a> and
<a href="QualityQuery.html">Quality queries</a>.</li>
<li>Create sophisticated Lucene queries by supplying a different
<a href="QualityQueryParser.html">Quality query parser</a>.</li>
</ul>
</body>
</html>

View File

@ -0,0 +1,158 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality.trec;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.lucene.benchmark.quality.Judge;
import org.apache.lucene.benchmark.quality.QualityQuery;
/**
 * A {@link Judge} that decides on relevancy of documents to quality queries
 * according to judgements given in the Trec QRels format.
 */
public class TrecJudge implements Judge {

  /* maps a query ID to the QRelJudgement holding the relevant docs of that query */
  HashMap judgements;

  /**
   * Constructor from a reader.
   * <p>
   * Expected input format:
   * <pre>
   *     qnum  0   doc-name     is-relevant
   * </pre>
   * Two sample lines:
   * <pre>
   *     19    0   doc303       1
   *     19    0   doc7295      0
   * </pre>
   * Empty lines and lines starting with '#' are skipped.
   * Only relevant docs are kept. The reader is closed when done.
   * @param reader where judgments are read from.
   * @throws IOException if reading the judgments fails.
   */
  public TrecJudge (BufferedReader reader) throws IOException {
    judgements = new HashMap();
    QRelJudgement current = null; // judgement of the most recently seen relevant query
    String notRelevant = "0";
    try {
      for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
        String line = raw.trim();
        if (line.length()==0 || line.charAt(0)=='#') {
          continue;
        }
        StringTokenizer tokens = new StringTokenizer(line);
        String queryID = tokens.nextToken();
        tokens.nextToken(); // second column is not used
        String docName = tokens.nextToken();
        boolean relevant = !notRelevant.equals(tokens.nextToken());
        assert !tokens.hasMoreTokens() : "wrong format: "+line+" next: "+tokens.nextToken();
        if (!relevant) {
          continue; // only keep relevant docs
        }
        boolean sameQuery = current!=null && current.queryID.equals(queryID);
        if (!sameQuery) {
          current = (QRelJudgement) judgements.get(queryID);
          if (current==null) {
            current = new QRelJudgement(queryID);
            judgements.put(queryID,current);
          }
        }
        current.addRelevantDoc(docName);
      }
    } finally {
      reader.close();
    }
  }

  // inherit javadocs
  public boolean isRelevant(String docName, QualityQuery query) {
    QRelJudgement judgement = (QRelJudgement) judgements.get(query.getQueryID());
    if (judgement==null) {
      return false;
    }
    return judgement.isRelevant(docName);
  }

  /** The relevant docs of a single trec quality query. */
  private static class QRelJudgement {
    private String queryID;
    private HashMap relevantDocs; // used as a set of doc names

    QRelJudgement(String queryID) {
      this.queryID = queryID;
      relevantDocs = new HashMap();
    }

    void addRelevantDoc(String docName) {
      relevantDocs.put(docName,docName);
    }

    boolean isRelevant(String docName) {
      return relevantDocs.containsKey(docName);
    }

    int maxRecall() {
      return relevantDocs.size();
    }
  }

  // inherit javadocs
  public boolean validateData(QualityQuery[] qq, PrintWriter logger) {
    // start from all judged queries and drop those that do have a matching input query
    HashMap missingQueries = (HashMap) judgements.clone();
    ArrayList missingJudgements = new ArrayList();
    for (int i=0; i<qq.length; i++) {
      String id = qq[i].getQueryID();
      if (missingQueries.remove(id)==null) {
        missingJudgements.add(id);
      }
    }
    boolean anyMissingJudgements = missingJudgements.size()>0;
    boolean anyMissingQueries = missingQueries.size()>0;
    if (anyMissingJudgements && logger!=null) {
      logger.println("WARNING: "+missingJudgements.size()+" queries have no judgments! - ");
      for (Iterator it = missingJudgements.iterator(); it.hasNext();) {
        logger.println(" "+(String)it.next());
      }
    }
    if (anyMissingQueries && logger!=null) {
      logger.println("WARNING: "+missingQueries.size()+" judgments match no query! - ");
      Iterator it = missingQueries.keySet().iterator();
      while (it.hasNext()) {
        logger.println(" "+(String)it.next());
      }
    }
    return !anyMissingJudgements && !anyMissingQueries;
  }

  // inherit javadocs
  public int maxRecall(QualityQuery query) {
    QRelJudgement judgement = (QRelJudgement) judgements.get(query.getQueryID());
    return judgement==null ? 0 : judgement.maxRecall();
  }
}

View File

@ -0,0 +1,123 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality.trec;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import org.apache.lucene.benchmark.quality.QualityQuery;
/**
 * Read TREC topics.
 * <p>
 * Expects this topic format -
 * <pre>
 *   &lt;top&gt;
 *   &lt;num&gt; Number: nnn
 *
 *   &lt;title&gt; title of the topic
 *
 *   &lt;desc&gt; Description:
 *   description of the topic
 *
 *   &lt;narr&gt; Narrative:
 *   "story" composed by assessors.
 *
 *   &lt;/top&gt;
 * </pre>
 * Tags are only recognized at the very start of a line.
 * <p>
 * NOTE(review): this reader has no special handling for comment lines starting
 * with '#'; such lines are treated like any other line.
 */
public class TrecTopicsReader {

  private static final String newline = System.getProperty("line.separator");

  /**
   * Constructor for Trec's TopicsReader
   */
  public TrecTopicsReader() {
    super();
  }

  /**
   * Read quality queries from trec format topics file.
   * Each topic becomes a quality query whose ID is the topic number, with
   * two name-value pairs: "title" and "description". The narrative is skipped.
   * The reader is closed when done.
   * @param reader where queries are read from.
   * @return the result quality queries, sorted by ID.
   * @throws IOException if cannot read the queries, or if the input ends in the
   *         middle of a topic, before its number, title, description or narrative tag.
   */
  public QualityQuery[] readQueries(BufferedReader reader) throws IOException {
    ArrayList res = new ArrayList();
    try {
      while (null!=read(reader,"<top>",null,false,false)) {
        HashMap fields = new HashMap();
        // id: whatever follows the ':' on the <num> line
        StringBuffer sb = readRequired(reader,"<num>",true,false);
        int k = sb.indexOf(":");
        String id = sb.substring(k+1).trim();
        // title: whatever follows the tag on the <title> line
        sb = readRequired(reader,"<title>",true,false);
        k = sb.indexOf(">");
        String title = sb.substring(k+1).trim();
        // description: all lines between the <desc> line and the <narr> line
        readRequired(reader,"<desc>",false,false);
        sb = readRequired(reader,"<narr>",false,true);
        String description = sb.toString().trim();
        // we got a topic!
        fields.put("title",title);
        fields.put("description",description);
        QualityQuery topic = new QualityQuery(id,fields);
        res.add(topic);
        // skip narrative, get to end of doc; a missing closing tag at end of input is tolerated
        read(reader,"</top>",null,false,false);
      }
    } finally {
      reader.close();
    }
    // sort result array (by ID)
    QualityQuery qq[] = (QualityQuery[]) res.toArray(new QualityQuery[0]);
    Arrays.sort(qq);
    return qq;
  }

  // same as read(), but end of input before the prefix is found is an error
  private StringBuffer readRequired (BufferedReader reader, String prefix, boolean collectMatchLine, boolean collectAll) throws IOException {
    StringBuffer sb = read(reader,prefix,null,collectMatchLine,collectAll);
    if (sb==null) {
      throw new IOException("Unexpected end of topics input: found no line starting with "+prefix);
    }
    return sb;
  }

  // read until finding a line that starts with the specified prefix;
  // returns null if the input ended before such a line was found.
  // collectAll: collect all lines read before the matching line;
  // collectMatchLine: collect the matching line as well.
  private StringBuffer read (BufferedReader reader, String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws IOException {
    sb = (sb==null ? new StringBuffer() : sb);
    String sep = "";
    while (true) {
      String line = reader.readLine();
      if (line==null) {
        return null;
      }
      if (line.startsWith(prefix)) {
        if (collectMatchLine) {
          sb.append(sep).append(line);
          sep = newline;
        }
        break;
      }
      if (collectAll) {
        sb.append(sep).append(line);
        sep = newline;
      }
    }
    return sb;
  }
}

View File

@ -0,0 +1,6 @@
<html>
<body>
Utilities for Trec related quality benchmarking, feeding from Trec Topics and QRels inputs.
</body>
</html>

View File

@ -0,0 +1,59 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality.utils;
import java.io.IOException;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.search.Searcher;
/**
 * Utility for extracting the names of docs out of an index.
 */
public class DocNameExtractor {

  private FieldSelector fldSel;
  private String docNameField;

  /**
   * Constructor for DocNameExtractor.
   * @param docNameField name of the stored field containing the doc name.
   */
  public DocNameExtractor (final String docNameField) {
    // load just the doc name field, and stop at it
    fldSel = new FieldSelector() {
      public FieldSelectorResult accept(String fieldName) {
        if (fieldName.equals(docNameField)) {
          return FieldSelectorResult.LOAD_AND_BREAK;
        }
        return FieldSelectorResult.NO_LOAD;
      }
    };
    this.docNameField = docNameField;
  }

  /**
   * Extract the name of the input doc from the index.
   * @param searcher access to the index.
   * @param docid ID of doc whose name is needed.
   * @return the value of the doc name field of that doc, as loaded through the searcher.
   * @throws IOException if cannot extract the doc name from the index.
   */
  public String docName(Searcher searcher, int docid) throws IOException {
    return searcher.doc(docid,fldSel).get(docNameField);
  }
}

View File

@ -0,0 +1,135 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality.utils;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.PriorityQueue;
/**
 * Suggest Quality queries based on an index contents.
 * Utility class, used for making quality test benchmarks.
 */
public class QualityQueriesFinder {

  private static final String newline = System.getProperty("line.separator");
  private Directory dir;

  /**
   * Constructor over a directory containing the index.
   * @param dir directory containing the index we search for the quality test.
   */
  private QualityQueriesFinder(Directory dir) {
    this.dir = dir;
  }

  /**
   * Print 20 suggested queries, built from terms of the "body" field,
   * each formatted as a TREC topic.
   * @param args {index-dir}
   * @throws IOException if cannot access the index.
   */
  public static void main(String[] args) throws IOException {
    if (args.length<1) {
      System.err.println("Usage: java QualityQueriesFinder <index-dir>");
      System.exit(1);
    }
    QualityQueriesFinder qqf = new QualityQueriesFinder(FSDirectory.getDirectory(new File(args[0])));
    String q[] = qqf.bestQueries("body",20);
    for (int i=0; i<q.length; i++) {
      System.out.println(newline+formatQueryAsTrecTopic(i,q[i],null,null));
    }
  }

  /**
   * Build queries of four words each out of the best terms of a field.
   * @param field index field whose terms are used.
   * @param numQueries number of queries wanted; fewer are returned if
   *        the field has less than 4*numQueries qualifying terms.
   * @return the suggested queries, each a blank separated list of four terms.
   * @throws IOException if cannot access the index.
   */
  private String [] bestQueries(String field,int numQueries) throws IOException {
    // honor the requested field (was hard-coded to "body")
    String words[] = bestTerms(field,4*numQueries);
    int n = words.length;
    int m = n/4;
    String res[] = new String[m];
    for (int i=0; i<res.length; i++) {
      // mix terms from both ends of the returned terms list into each query
      res[i] = words[i] + " " + words[m+i]+ " " + words[n-1-m-i] + " " + words[n-1-i];
      //System.out.println("query["+i+"]: "+res[i]);
    }
    return res;
  }

  /**
   * Format a query as a TREC topic: a &lt;top&gt; element with
   * num, title, desc and narr parts. Null parts are printed as empty text.
   */
  private static String formatQueryAsTrecTopic (int qnum, String title, String description, String narrative) {
    return
      "<top>" + newline +
      "<num> Number: " + qnum + newline + newline +
      "<title> " + (title==null?"":title) + newline + newline +
      "<desc> Description:" + newline +
      (description==null?"":description) + newline + newline +
      "<narr> Narrative:" + newline +
      (narrative==null?"":narrative) + newline + newline +
      "</top>";
  }

  /**
   * Collect terms of a field whose doc frequency is below maxDoc/10,
   * through a queue of size numTerms ordered by doc frequency.
   * @param field index field whose terms are examined.
   * @param numTerms size of the terms queue.
   * @return the collected terms, in the order popped from the queue.
   * @throws IOException if cannot access the index.
   */
  private String [] bestTerms(String field,int numTerms) throws IOException {
    PriorityQueue pq = new TermsDfQueue(numTerms);
    IndexReader ir = IndexReader.open(dir);
    try {
      int threshold = ir.maxDoc() / 10; // ignore words too common.
      TermEnum terms = ir.terms(new Term(field,""));
      try {
        // The enum returned by terms(Term) is already positioned on the first
        // term at or after the given one, so examine the current term before
        // advancing - otherwise the first term of the field is skipped.
        do {
          Term term = terms.term();
          if (term==null || !field.equals(term.field())) {
            break; // no more terms at all, or no more terms of this field
          }
          int df = terms.docFreq();
          if (df<threshold) {
            pq.insert(new TermDf(term.text(),df));
          }
        } while (terms.next());
      } finally {
        terms.close();
      }
    } finally {
      ir.close();
    }
    String res[] = new String[pq.size()];
    int i = 0;
    while (pq.size()>0) {
      TermDf tdf = (TermDf) pq.pop();
      res[i++] = tdf.word;
      System.out.println(i+". word:  "+tdf.df+"   "+tdf.word);
    }
    return res;
  }

  /** A term text together with its document frequency. */
  private static class TermDf {
    String word;
    int df;
    TermDf (String word, int freq) {
      this.word = word;
      this.df = freq;
    }
  }

  /** Priority queue of {@link TermDf} entries, ordered by document frequency. */
  private static class TermsDfQueue extends PriorityQueue {
    TermsDfQueue (int maxSize) {
      initialize(maxSize);
    }
    protected boolean lessThan(Object a, Object b) {
      TermDf tf1 = (TermDf) a;
      TermDf tf2 = (TermDf) b;
      return tf1.df < tf2.df;
    }
  }

}

View File

@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality.utils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
/**
 * A minimal quality query parser: one named value of the QualityQuery
 * is handed to a Lucene QueryParser that is built over a single index
 * field with a StandardAnalyzer.
 */
public class SimpleQQParser implements QualityQueryParser {

  private String qqName;
  private String indexField;
  // holds one QueryParser per calling thread, created on first use
  ThreadLocal queryParser = new ThreadLocal();

  /**
   * Create a simple qq parser.
   * @param qqName name of the quality query name-value pair whose value is parsed
   * @param indexField index field the created queries run against
   */
  public SimpleQQParser(String qqName, String indexField) {
    this.qqName = qqName;
    this.indexField = indexField;
  }

  /**
   * Parse the configured value of the quality query into a Lucene query.
   * @see org.apache.lucene.benchmark.quality.QualityQueryParser#parse(org.apache.lucene.benchmark.quality.QualityQuery)
   */
  public Query parse(QualityQuery qq) throws ParseException {
    Object cached = queryParser.get();
    QueryParser parser;
    if (cached != null) {
      parser = (QueryParser) cached;
    } else {
      // first call on this thread: create the parser and remember it
      parser = new QueryParser(indexField, new StandardAnalyzer());
      queryParser.set(parser);
    }
    String queryText = qq.getValue(qqName);
    return parser.parse(queryText);
  }

}

View File

@ -0,0 +1,83 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.quality.utils;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.NumberFormat;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
/**
 * Create a log ready for submission.
 * Extend this class and override
 * {@link #report(QualityQuery, TopDocs, String, Searcher)}
 * to create different reports.
 */
public class SubmissionReport {

  private NumberFormat nf;
  private PrintWriter logger;

  /**
   * Constructor for SubmissionReport.
   * Scores are printed with exactly 4 fraction digits.
   * @param logger if null, no submission data is created.
   */
  public SubmissionReport (PrintWriter logger) {
    this.logger = logger;
    nf = NumberFormat.getInstance();
    nf.setMaximumFractionDigits(4);
    nf.setMinimumFractionDigits(4);
  }

  /**
   * Report a search result for a certain quality query.
   * One line is printed per result: query ID, a constant '0', the doc name,
   * the 0-based position of the doc in the results, and its score.
   * @param qq quality query for which the results are reported.
   * @param td search results for the query.
   * @param docNameField stored field used for fetching the result doc name.
   * @param searcher index access for fetching doc name.
   * @throws IOException in case of a problem.
   */
  public void report(QualityQuery qq, TopDocs td, String docNameField, Searcher searcher) throws IOException {
    if (logger==null) {
      return;
    }
    ScoreDoc sd[] = td.scoreDocs;
    String sep = " \t ";
    DocNameExtractor xt = new DocNameExtractor(docNameField);
    for (int i=0; i<sd.length; i++) {
      String docName = xt.docName(searcher,sd[i].doc);
      logger.println(
          qq.getQueryID()       + sep +
          '0'                   + sep +
          format(docName,20)    + sep +
          format(""+i,7)        + sep +
          nf.format(sd[i].score)
          );
    }
  }

  /**
   * Right-pad a string with blanks up to a minimal length.
   * Padding is appended as needed, so any minLen is supported
   * (a fixed-size padding string would make substring() fail
   * once the missing width exceeds the padding length).
   * @param s text to format, null is treated as an empty string.
   * @param minLen minimal length of the result.
   * @return s itself if it is at least minLen long, otherwise s followed by blanks up to minLen.
   */
  private String format(String s, int minLen) {
    StringBuffer sb = new StringBuffer(s==null ? "" : s);
    while (sb.length() < minLen) {
      sb.append(' ');
    }
    return sb.toString();
  }

}

View File

@ -0,0 +1,6 @@
<html>
<body>
Miscellaneous utilities for search quality benchmarking: query parsing, submission reports.
</body>
</html>

View File

@ -23,6 +23,9 @@ import java.io.FileReader;
import java.io.BufferedReader;
import org.apache.lucene.benchmark.byTask.Benchmark;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@ -135,8 +138,8 @@ public class TestPerfTasksLogic extends TestCase {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
"doc.add.log.step=2697",
"doc.maker="+Reuters20DocMaker.class.getName(),
"doc.add.log.step=3",
"doc.term.vector=false",
"doc.maker.forever=false",
"directory=FSDirectory",
@ -153,7 +156,7 @@ public class TestPerfTasksLogic extends TestCase {
// 3. test number of docs in the index
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 21578; // that's how many docs there are in the Reuters collecton.
int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
@ -221,7 +224,7 @@ public class TestPerfTasksLogic extends TestCase {
}
// create the benchmark and execute it.
private Benchmark execBenchmark(String[] algLines) throws Exception {
public static Benchmark execBenchmark(String[] algLines) throws Exception {
String algText = algLinesToText(algLines);
logTstLogic(algText);
Benchmark benchmark = new Benchmark(new StringReader(algText));
@ -230,7 +233,7 @@ public class TestPerfTasksLogic extends TestCase {
}
// catenate alg lines to make the alg text
private String algLinesToText(String[] algLines) {
private static String algLinesToText(String[] algLines) {
String indent = " ";
StringBuffer sb = new StringBuffer();
for (int i = 0; i < propLines.length; i++) {
@ -242,11 +245,22 @@ public class TestPerfTasksLogic extends TestCase {
return sb.toString();
}
private void logTstLogic (String txt) {
private static void logTstLogic (String txt) {
if (!DEBUG)
return;
System.out.println("Test logic of:");
System.out.println(txt);
}
/**
 * Reuters doc maker for faster tests: relies on the exhaust mechanism,
 * but signals no-more-data once 20 docs were requested (unless running forever).
 */
public static class Reuters20DocMaker extends ReutersDocMaker {
  private int nDocs=0;
  protected DocData getNextDocData() throws Exception {
    boolean exhausted = !forever && nDocs>=20;
    if (exhausted) {
      throw new NoMoreDataException();
    }
    // count the request before delegating to the real Reuters doc maker
    nDocs += 1;
    return super.getNextDocData();
  }
}
}

View File

@ -0,0 +1,174 @@
package org.apache.lucene.benchmark.quality;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
import org.apache.lucene.benchmark.quality.Judge;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;
import org.apache.lucene.benchmark.quality.QualityBenchmark;
import org.apache.lucene.benchmark.quality.trec.TrecJudge;
import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader;
import org.apache.lucene.benchmark.quality.utils.SimpleQQParser;
import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import junit.framework.TestCase;
/**
 * Test that quality run does its job.
 */
public class TestQualityRun extends TestCase {

  private static boolean DEBUG = Boolean.getBoolean("tests.verbose");

  /**
   * @param name name of the test, passed on to TestCase.
   */
  public TestQualityRun(String name) {
    super(name);
  }

  /**
   * Index Reuters, run the test topics against that index, and verify
   * that the computed quality stats match the way the judgments file
   * was altered for this test.
   */
  public void testTrecQuality() throws Exception {
    // first create the complete reuters index
    createReutersIndex();

    File workDir = new File(System.getProperty("benchmark.work.dir","work"));
    assertTrue("Bad workDir: "+workDir, workDir.exists()&& workDir.isDirectory());

    int maxResults = 1000;
    String docNameField = "docid";

    // debug prints only when tests.verbose is set
    PrintWriter logger = DEBUG ? new PrintWriter(System.out,true) : null;

    // <tests src dir> for topics/qrels files - src/test/org/apache/lucene/benchmark/quality
    File srcTestDir = new File(new File(new File(new File(new File(
      new File(new File(workDir.getAbsoluteFile().getParentFile(),
        "src"),"test"),"org"),"apache"),"lucene"),"benchmark"),"quality");

    // prepare topics
    File topicsFile = new File(srcTestDir, "trecTopics.txt");
    assertTrue("Bad topicsFile: "+topicsFile, topicsFile.exists()&& topicsFile.isFile());
    TrecTopicsReader qReader = new TrecTopicsReader();
    BufferedReader topicsReader = new BufferedReader(new FileReader(topicsFile));
    QualityQuery qqs[];
    try {
      qqs = qReader.readQueries(topicsReader);
    } finally {
      // all queries were returned as an array, the file is no longer needed
      topicsReader.close();
    }

    // prepare judge
    File qrelsFile = new File(srcTestDir, "trecQRels.txt");
    assertTrue("Bad qrelsFile: "+qrelsFile, qrelsFile.exists()&& qrelsFile.isFile());
    // NOTE(review): the qrels reader is left for TrecJudge to manage - it is not
    // visible from here whether the judge is done reading once constructed.
    Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));

    // validate topics & judgments match each other
    judge.validateData(qqs, logger);

    IndexSearcher searcher = new IndexSearcher(FSDirectory.getDirectory(new File(workDir,"index")));
    QualityStats stats[];
    try {
      QualityQueryParser qqParser = new SimpleQQParser("title","body");
      QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);

      SubmissionReport submitLog = DEBUG ? new SubmissionReport(logger) : null;
      stats = qrun.execute(maxResults, judge, submitLog, logger);
    } finally {
      // release the index, also when the quality run fails
      searcher.close();
    }

    // --------- verify by the way judgments were altered for this test:
    // for some queries, depending on m = qnum % 8
    // m==0: avg_precision and recall are hurt, by marking fake docs as relevant
    // m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
    // m==2: all of avg_precision, precision_at_n and recall are hurt.
    // m>=3: these queries remain perfect
    for (int i = 0; i < stats.length; i++) {
      QualityStats s = stats[i];
      switch (i%8) {

      case 0:
        assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp());
        assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall());
        for (int j = 1; j <= QualityStats.MAX_POINTS; j++) {
          assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9);
        }
        break;

      case 1:
        assertTrue("avg-p should be hurt", 1.0 > s.getAvp());
        assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9);
        for (int j = 1; j <= QualityStats.MAX_POINTS; j++) {
          assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j));
        }
        break;

      case 2:
        assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp());
        assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall());
        for (int j = 1; j <= QualityStats.MAX_POINTS; j++) {
          assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j));
        }
        break;

      default: {
        assertEquals("avg-p should be perfect: "+s.getAvp(), 1.0, s.getAvp(), 1E-9);
        assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9);
        for (int j = 1; j <= QualityStats.MAX_POINTS; j++) {
          assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9);
        }
      }

      }
    }

    // some of the queries are hurt, so the averages cannot be perfect either
    QualityStats avg = QualityStats.average(stats);
    if (logger!=null) {
      avg.log("Average statistis:",1,logger," ");
    }

    assertTrue("mean avg-p should be hurt: "+avg.getAvp(), 1.0 > avg.getAvp());
    assertTrue("avg recall should be hurt: "+avg.getRecall(), 1.0 > avg.getRecall());
    for (int j = 1; j <= QualityStats.MAX_POINTS; j++) {
      assertTrue("avg p_at_"+j+" should be hurt: "+avg.getPrecisionAt(j), 1.0 > avg.getPrecisionAt(j));
    }

  }

  // use benchmark logic to create the full Reuters index
  private void createReutersIndex() throws Exception {
    // 1. alg definition
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+ReutersDocMaker.class.getName(),
        "doc.add.log.step=2500",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "directory=FSDirectory",
        "doc.stored=true",
        "doc.tokenized=true",
        "# ----- alg ",
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : *",
        "CloseIndex",
    };

    // 2. execute the algorithm  (required in every "logic" test)
    TestPerfTasksLogic.execBenchmark(algLines);
  }

}

View File

@ -0,0 +1,723 @@
# -----------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -----------------------------------------------------------------------
# ------------------------------------------------------------
# Format:
#
# qnum 0 doc-name is-relevant
#
#
# The original version of this file was created using
# utils.QualityQueriesFinder, so all queries
# would have a perfect 1.0 for all measures.
#
# To make it suitable for testing it was modified
# for some queries, depending on m = qnum % 8
# m==0: avg_precision and recall are hurt, by marking fake docs as relevant
# m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
# m==2: all of avg_precision, precision_at_n and recall are hurt.
# m>=3: these queries remain perfect
# ------------------------------------------------------------
# --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant
0 0 fakedoc1 1
0 0 fakedoc2 1
0 0 fakedoc3 1
0 0 fakedoc4 1
0 0 doc18211 1
0 0 doc20192 1
0 0 doc7401 1
0 0 doc11285 1
0 0 doc20647 1
0 0 doc3057 1
0 0 doc12431 1
0 0 doc4989 1
0 0 doc17324 1
0 0 doc4030 1
0 0 doc4290 1
0 0 doc3462 1
0 0 doc15313 1
0 0 doc10303 1
0 0 doc1893 1
0 0 doc5008 1
0 0 doc14634 1
0 0 doc5471 1
0 0 doc17904 1
0 0 doc7168 1
0 0 doc21275 1
0 0 doc9011 1
0 0 doc17546 1
0 0 doc9102 1
0 0 doc13199 1
# --- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
1 0 doc9857 0
1 0 doc16846 1
1 0 doc4320 1
1 0 doc9501 0
1 0 doc10159 1
1 0 doc16642 1
1 0 doc17536 0
1 0 doc17571 1
1 0 doc18728 1
1 0 doc18828 1
1 0 doc19108 0
1 0 doc9940 1
1 0 doc11852 1
1 0 doc7430 0
1 0 doc19162 1
1 0 doc1743 1
1 0 doc2137 1
1 0 doc7611 1
1 0 doc8072 1
1 0 doc12764 1
1 0 doc2593 1
1 0 doc11088 1
1 0 doc931 1
1 0 doc7673 1
1 0 doc12941 1
1 0 doc11797 1
1 0 doc11831 1
1 0 doc13162 1
1 0 doc4423 1
1 0 doc5217 1
# ---- m==2: all precision, precision_at_n and recall are hurt.
2 0 fakedoc1 1
2 0 fakedoc2 1
2 0 fakedoc3 1
2 0 fakedoc4 1
2 0 doc3137 0
2 0 doc7142 0
2 0 doc13667 0
2 0 doc13171 0
2 0 doc13372 1
2 0 doc21415 1
2 0 doc16298 1
2 0 doc14957 1
2 0 doc153 1
2 0 doc16092 1
2 0 doc16096 1
2 0 doc21303 1
2 0 doc18681 1
2 0 doc20756 1
2 0 doc355 1
2 0 doc13395 1
2 0 doc5009 1
2 0 doc17164 1
2 0 doc13162 1
2 0 doc11757 1
2 0 doc9637 1
2 0 doc18087 1
2 0 doc4593 1
2 0 doc4677 1
2 0 doc20865 1
2 0 doc8556 1
2 0 doc2578 1
2 0 doc1163 1
2 0 doc3797 1
2 0 doc11094 1
3 0 doc19578 1
3 0 doc14860 1
3 0 doc7235 1
3 0 doc20590 1
3 0 doc17933 1
3 0 doc9384 1
3 0 doc10783 1
3 0 doc1963 1
3 0 doc18356 1
3 0 doc13254 1
3 0 doc18402 1
3 0 doc15241 1
3 0 doc3303 1
3 0 doc8868 1
3 0 doc18520 1
3 0 doc4650 1
3 0 doc4727 1
3 0 doc21518 1
3 0 doc5060 1
3 0 doc7587 1
3 0 doc2990 1
3 0 doc8042 1
3 0 doc6304 1
3 0 doc13223 1
3 0 doc1964 1
3 0 doc10597 1
3 0 doc21023 1
3 0 doc19057 1
3 0 doc14948 1
3 0 doc9692 1
4 0 doc2534 1
4 0 doc21388 1
4 0 doc20923 1
4 0 doc11547 1
4 0 doc19755 1
4 0 doc3793 1
4 0 doc6714 1
4 0 doc12722 1
4 0 doc5552 1
4 0 doc6810 1
4 0 doc16953 1
4 0 doc2527 1
4 0 doc5361 1
4 0 doc12353 1
4 0 doc7308 1
4 0 doc3836 1
4 0 doc2293 1
4 0 doc7348 1
4 0 doc17119 1
4 0 doc19331 1
4 0 doc3411 1
4 0 doc14643 1
4 0 doc9058 1
4 0 doc11099 1
4 0 doc12485 1
4 0 doc16432 1
4 0 doc10047 1
4 0 doc13788 1
4 0 doc117 1
4 0 doc638 1
5 0 doc169 1
5 0 doc13181 1
5 0 doc4350 1
5 0 doc10242 1
5 0 doc955 1
5 0 doc5389 1
5 0 doc17122 1
5 0 doc17417 1
5 0 doc12199 1
5 0 doc6918 1
5 0 doc3857 1
5 0 doc2981 1
5 0 doc10639 1
5 0 doc10478 1
5 0 doc8573 1
5 0 doc9197 1
5 0 doc9298 1
5 0 doc2492 1
5 0 doc10262 1
5 0 doc5180 1
5 0 doc11758 1
5 0 doc4065 1
5 0 doc9124 1
5 0 doc11528 1
5 0 doc18879 1
5 0 doc17864 1
5 0 doc3204 1
5 0 doc12157 1
5 0 doc4496 1
5 0 doc20190 1
6 0 doc9507 1
6 0 doc15630 1
6 0 doc8469 1
6 0 doc11918 1
6 0 doc20482 1
6 0 doc20158 1
6 0 doc19831 1
6 0 doc8296 1
6 0 doc8930 1
6 0 doc16460 1
6 0 doc2577 1
6 0 doc15476 1
6 0 doc1767 1
6 0 doc689 1
6 0 doc16606 1
6 0 doc6149 1
6 0 doc18691 1
6 0 doc2208 1
6 0 doc3592 1
6 0 doc11199 1
6 0 doc16329 1
6 0 doc6007 1
6 0 doc15231 1
6 0 doc20622 1
6 0 doc21468 1
6 0 doc12230 1
6 0 doc5723 1
6 0 doc8120 1
6 0 doc8668 1
6 0 doc303 1
7 0 doc7728 1
7 0 doc7693 1
7 0 doc21088 1
7 0 doc5017 1
7 0 doc10807 1
7 0 doc16204 1
7 0 doc2233 1
7 0 doc3632 1
7 0 doc4719 1
7 0 doc6477 1
7 0 doc6502 1
7 0 doc6709 1
7 0 doc7710 1
7 0 doc9193 1
7 0 doc9309 1
7 0 doc9789 1
7 0 doc10971 1
7 0 doc18059 1
7 0 doc19906 1
7 0 doc20089 1
7 0 doc20102 1
7 0 doc21040 1
7 0 doc21153 1
7 0 doc9147 1
7 0 doc9930 1
7 0 doc19763 1
7 0 doc1559 1
7 0 doc21248 1
7 0 doc17945 1
7 0 doc526 1
# --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant
8 0 fakedoc1 1
8 0 fakedoc2 1
8 0 fakedoc3 1
8 0 fakedoc4 1
8 0 doc16299 1
8 0 doc1662 1
8 0 doc4585 1
8 0 doc12315 1
8 0 doc16266 1
8 0 doc13136 1
8 0 doc19212 1
8 0 doc7086 1
8 0 doc7062 1
8 0 doc6134 1
8 0 doc13953 1
8 0 doc16264 1
8 0 doc2494 1
8 0 doc10636 1
8 0 doc10894 1
8 0 doc6844 1
8 0 doc674 1
8 0 doc13520 1
8 0 doc344 1
8 0 doc2896 1
8 0 doc11871 1
8 0 doc1862 1
8 0 doc16728 1
8 0 doc10308 1
8 0 doc2227 1
8 0 doc13167 1
8 0 doc20607 1
8 0 doc9670 1
8 0 doc1566 1
8 0 doc17885 1
# ---- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
9 0 doc1990 0
9 0 doc9342 1
9 0 doc19427 1
9 0 doc12432 0
9 0 doc13480 1
9 0 doc3322 1
9 0 doc16044 1
9 0 doc266 0
9 0 doc3437 1
9 0 doc5370 1
9 0 doc10314 1
9 0 doc4892 1
9 0 doc5763 0
9 0 doc14045 1
9 0 doc1090 1
9 0 doc7437 1
9 0 doc5822 1
9 0 doc4285 1
9 0 doc17119 1
9 0 doc21001 1
9 0 doc4337 1
9 0 doc5967 1
9 0 doc10214 1
9 0 doc12001 1
9 0 doc18553 1
9 0 doc12116 1
9 0 doc5064 1
9 0 doc5018 1
9 0 doc5037 1
9 0 doc8025 1
# ---- m==2: all precision, precision_at_n and recall are hurt.
10 0 fakedoc1 1
10 0 fakedoc2 1
10 0 fakedoc3 1
10 0 fakedoc4 1
10 0 doc17218 0
10 0 doc10270 0
10 0 doc5958 0
10 0 doc19943 0
10 0 doc6510 1
10 0 doc16087 1
10 0 doc14893 1
10 0 doc8933 1
10 0 doc4354 1
10 0 doc16729 1
10 0 doc16761 1
10 0 doc6964 1
10 0 doc16743 1
10 0 doc7357 1
10 0 doc2534 1
10 0 doc18321 1
10 0 doc18497 1
10 0 doc11214 1
10 0 doc11819 1
10 0 doc10818 1
10 0 doc15769 1
10 0 doc5348 1
10 0 doc14948 1
10 0 doc7891 1
10 0 doc9897 1
10 0 doc15559 1
10 0 doc14935 1
10 0 doc14954 1
10 0 doc6621 1
10 0 doc6930 1
11 0 doc11943 1
11 0 doc286 1
11 0 doc1574 1
11 0 doc17916 1
11 0 doc17918 1
11 0 doc19213 1
11 0 doc9337 1
11 0 doc8593 1
11 0 doc8800 1
11 0 doc18580 1
11 0 doc209 1
11 0 doc1893 1
11 0 doc11189 1
11 0 doc17702 1
11 0 doc10180 1
11 0 doc11869 1
11 0 doc9705 1
11 0 doc8715 1
11 0 doc12753 1
11 0 doc10195 1
11 0 doc3552 1
11 0 doc16030 1
11 0 doc4623 1
11 0 doc3188 1
11 0 doc8735 1
11 0 doc151 1
11 0 doc5792 1
11 0 doc5194 1
11 0 doc3393 1
11 0 doc19027 1
12 0 doc18198 1
12 0 doc2444 1
12 0 doc4305 1
12 0 doc6544 1
12 0 doc11639 1
12 0 doc10640 1
12 0 doc12192 1
12 0 doc128 1
12 0 doc10760 1
12 0 doc10881 1
12 0 doc2698 1
12 0 doc3552 1
12 0 doc20524 1
12 0 doc1884 1
12 0 doc9187 1
12 0 doc3131 1
12 0 doc2911 1
12 0 doc2589 1
12 0 doc3747 1
12 0 doc3813 1
12 0 doc5222 1
12 0 doc6023 1
12 0 doc6624 1
12 0 doc7655 1
12 0 doc9205 1
12 0 doc12062 1
12 0 doc15504 1
12 0 doc13625 1
12 0 doc18704 1
12 0 doc2277 1
13 0 doc4948 1
13 0 doc21565 1
13 0 doc17135 1
13 0 doc1866 1
13 0 doc13989 1
13 0 doc5605 1
13 0 doc13431 1
13 0 doc2100 1
13 0 doc16347 1
13 0 doc16894 1
13 0 doc6764 1
13 0 doc8554 1
13 0 doc8695 1
13 0 doc8977 1
13 0 doc19478 1
13 0 doc14595 1
13 0 doc2408 1
13 0 doc2592 1
13 0 doc10947 1
13 0 doc15794 1
13 0 doc5236 1
13 0 doc14847 1
13 0 doc3980 1
13 0 doc1844 1
13 0 doc42 1
13 0 doc7783 1
13 0 doc4557 1
13 0 doc16423 1
13 0 doc17170 1
13 0 doc5822 1
14 0 doc17172 1
14 0 doc17210 1
14 0 doc5044 1
14 0 doc4627 1
14 0 doc4683 1
14 0 doc15126 1
14 0 doc4538 1
14 0 doc273 1
14 0 doc19585 1
14 0 doc16078 1
14 0 doc4529 1
14 0 doc4186 1
14 0 doc12961 1
14 0 doc19217 1
14 0 doc5670 1
14 0 doc1699 1
14 0 doc4716 1
14 0 doc12644 1
14 0 doc18387 1
14 0 doc336 1
14 0 doc16130 1
14 0 doc18718 1
14 0 doc12527 1
14 0 doc11797 1
14 0 doc11831 1
14 0 doc7538 1
14 0 doc17259 1
14 0 doc18724 1
14 0 doc19330 1
14 0 doc19206 1
15 0 doc12198 1
15 0 doc20371 1
15 0 doc2947 1
15 0 doc10750 1
15 0 doc7239 1
15 0 doc14189 1
15 0 doc19474 1
15 0 doc14776 1
15 0 doc21270 1
15 0 doc6387 1
15 0 doc12908 1
15 0 doc9573 1
15 0 doc17102 1
15 0 doc21482 1
15 0 doc6524 1
15 0 doc18034 1
15 0 doc1358 1
15 0 doc13147 1
15 0 doc17731 1
15 0 doc12890 1
15 0 doc20887 1
15 0 doc19508 1
15 0 doc18498 1
15 0 doc20642 1
15 0 doc19878 1
15 0 doc6556 1
15 0 doc10272 1
15 0 doc5720 1
15 0 doc17578 1
15 0 doc17164 1
# --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant
16 0 fakedoc1 1
16 0 fakedoc2 1
16 0 fakedoc3 1
16 0 fakedoc4 1
16 0 doc4043 1
16 0 doc14985 1
16 0 doc15370 1
16 0 doc15426 1
16 0 doc1702 1
16 0 doc3062 1
16 0 doc16134 1
16 0 doc15037 1
16 0 doc8224 1
16 0 doc5044 1
16 0 doc8545 1
16 0 doc7228 1
16 0 doc12686 1
16 0 doc16609 1
16 0 doc13161 1
16 0 doc3446 1
16 0 doc16493 1
16 0 doc19297 1
16 0 doc13619 1
16 0 doc3281 1
16 0 doc15499 1
16 0 doc7373 1
16 0 doc9064 1
16 0 doc1710 1
16 0 doc15411 1
16 0 doc10890 1
16 0 doc3166 1
16 0 doc17894 1
16 0 doc4560 1
16 0 doc12766 1
# --- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
17 0 doc3117 0
17 0 doc7477 0
17 0 doc7569 0
17 0 doc20667 0
17 0 doc20260 1
17 0 doc17355 1
17 0 doc11021 1
17 0 doc20934 1
17 0 doc552 1
17 0 doc20856 1
17 0 doc3524 1
17 0 doc17343 1
17 0 doc21055 1
17 0 doc19032 1
17 0 doc19786 1
17 0 doc9281 1
17 0 doc1695 1
17 0 doc15940 1
17 0 doc9215 1
17 0 doc8335 1
17 0 doc20936 1
17 0 doc6914 1
17 0 doc12122 1
17 0 doc6618 1
17 0 doc5049 1
17 0 doc450 1
17 0 doc19206 1
17 0 doc18823 1
17 0 doc5307 1
17 0 doc17295 1
# ---- m==2: all precision, precision_at_n and recall are hurt.
18 0 fakedoc1 1
18 0 fakedoc2 1
18 0 fakedoc3 1
18 0 fakedoc4 1
18 0 doc8064 0
18 0 doc18142 0
18 0 doc19383 0
18 0 doc21151 0
18 0 doc4665 1
18 0 doc2897 1
18 0 doc6878 1
18 0 doc14507 1
18 0 doc2976 1
18 0 doc11757 1
18 0 doc12625 1
18 0 doc14908 1
18 0 doc12790 1
18 0 doc17915 1
18 0 doc11804 1
18 0 doc12935 1
18 0 doc8225 1
18 0 doc18011 1
18 0 doc10493 1
18 0 doc17922 1
18 0 doc1902 1
18 0 doc14049 1
18 0 doc1334 1
18 0 doc1168 1
18 0 doc4859 1
18 0 doc7124 1
18 0 doc9692 1
18 0 doc18402 1
18 0 doc9089 1
18 0 doc15375 1
19 0 doc5267 1
19 0 doc2310 1
19 0 doc11435 1
19 0 doc15666 1
19 0 doc12733 1
19 0 doc7925 1
19 0 doc2444 1
19 0 doc4900 1
19 0 doc10803 1
19 0 doc8869 1
19 0 doc5051 1
19 0 doc9163 1
19 0 doc529 1
19 0 doc19546 1
19 0 doc18561 1
19 0 doc10634 1
19 0 doc3979 1
19 0 doc8833 1
19 0 doc7652 1
19 0 doc4804 1
19 0 doc12616 1
19 0 doc8419 1
19 0 doc9431 1
19 0 doc16235 1
19 0 doc732 1
19 0 doc2515 1
19 0 doc7194 1
19 0 doc16301 1
19 0 doc4494 1
19 0 doc4496 1

View File

@ -0,0 +1,281 @@
# -----------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -----------------------------------------------------------------------
# ------------------------------------------------------------
# This file was created using utils.QualityQueriesFinder.
# See also TrecQRels.txt.
# ------------------------------------------------------------
<top>
<num> Number: 0
<title> statement months total 1987
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 1
<title> agreed 15 against five
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 2
<title> nine only month international
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 3
<title> finance any 10 government
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 4
<title> issue next years all
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 5
<title> who major ltd today
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 6
<title> business revs securities per
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 7
<title> quarter time note sales
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 8
<title> february earlier loss group
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 9
<title> out end made some
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 10
<title> spokesman financial 30 expected
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 11
<title> 1985 now prices due
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 12
<title> before board record could
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 13
<title> pay debt because trade
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 14
<title> meeting increase four price
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 15
<title> chairman rate six interest
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 16
<title> since current between agreement
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 17
<title> oil we when president
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 18
<title> capital through foreign added
<desc> Description:
<narr> Narrative:
</top>
<top>
<num> Number: 19
<title> 20 while common week
<desc> Description:
<narr> Narrative:
</top>