Rename ranking evaluation `quality_level` to `metric_score` (#32168)

The notion of "quality" is an overloaded term in the search ranking evaluation 
context. Its usually used to decribe certain levels of "good" vs. "bad" of a 
seach result with respect to the users information need. We currently report the 
result of the ranking evaluation as `quality_level` which is a bit missleading.
This changes the response parameter name to `metric_score` which fits better.
Christoph Büscher 2018-07-23 22:25:02 +02:00 committed by GitHub
parent 1b1aa4ecff
commit fe6bb75eb4
20 changed files with 114 additions and 117 deletions


@@ -80,7 +80,7 @@ public class RankEvalIT extends ESRestHighLevelClientTestCase {
RankEvalResponse response = execute(rankEvalRequest, highLevelClient()::rankEval, highLevelClient()::rankEvalAsync);
// the expected Prec@ for the first query is 5/7 and the expected Prec@ for the second is 1/7, divided by 2 to get the average
double expectedPrecision = (1.0 / 7.0 + 5.0 / 7.0) / 2.0;
assertEquals(expectedPrecision, response.getEvaluationResult(), Double.MIN_VALUE);
assertEquals(expectedPrecision, response.getMetricScore(), Double.MIN_VALUE);
Map<String, EvalQueryQuality> partialResults = response.getPartialResults();
assertEquals(2, partialResults.size());
EvalQueryQuality amsterdamQueryQuality = partialResults.get("amsterdam_query");


@@ -1136,14 +1136,14 @@ public class SearchDocumentationIT extends ESRestHighLevelClientTestCase {
// end::rank-eval-execute
// tag::rank-eval-response
double evaluationResult = response.getEvaluationResult(); // <1>
double evaluationResult = response.getMetricScore(); // <1>
assertEquals(1.0 / 3.0, evaluationResult, 0.0);
Map<String, EvalQueryQuality> partialResults =
response.getPartialResults();
EvalQueryQuality evalQuality =
partialResults.get("kimchy_query"); // <2>
assertEquals("kimchy_query", evalQuality.getId());
double qualityLevel = evalQuality.getQualityLevel(); // <3>
double qualityLevel = evalQuality.metricScore(); // <3>
assertEquals(1.0 / 3.0, qualityLevel, 0.0);
List<RatedSearchHit> hitsAndRatings = evalQuality.getHitsAndRatings();
RatedSearchHit ratedSearchHit = hitsAndRatings.get(2);


@@ -270,10 +270,10 @@ that shows potential errors of individual queries. The response has the following
--------------------------------
{
"rank_eval": {
"quality_level": 0.4, <1>
"metric_score": 0.4, <1>
"details": {
"my_query_id1": { <2>
"quality_level": 0.6, <3>
"metric_score": 0.6, <3>
"unrated_docs": [ <4>
{
"_index": "my_index",
@@ -308,7 +308,7 @@ that shows potential errors of individual queries. The response has the following
<1> the overall evaluation quality calculated by the defined metric
<2> the `details` section contains one entry for every query in the original `requests` section, keyed by the search request id
<3> the `quality_level` in the `details` section shows the contribution of this query to the global quality score
<3> the `metric_score` in the `details` section shows the contribution of this query to the global quality metric score
<4> the `unrated_docs` section contains an `_index` and `_id` entry for each document in the search result for this
query that didn't have a rating value. This can be used to ask the user to supply ratings for these documents
<5> the `hits` section shows a grouping of the search results with their supplied rating
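
With the default combination strategy, the top-level `metric_score` is the average of the per-query scores in the `details` section. A minimal sketch of that arithmetic (the second query and its 0.2 score are assumed for illustration):

--------------------------------
// default behaviour: the global metric_score averages the per-query scores
double myQueryId1Score = 0.6; // details.my_query_id1.metric_score, as shown above
double myQueryId2Score = 0.2; // hypothetical second query
double globalMetricScore = (myQueryId1Score + myQueryId2Score) / 2.0; // 0.4
--------------------------------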


@@ -126,8 +126,6 @@ public class DiscountedCumulativeGain implements EvaluationMetric {
@Override
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
List<RatedDocument> ratedDocs) {
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
.collect(Collectors.toList());
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
List<Integer> ratingsInSearchHits = new ArrayList<>(ratedHits.size());
int unratedResults = 0;
@@ -144,6 +142,8 @@ public class DiscountedCumulativeGain implements EvaluationMetric {
double idcg = 0;
if (normalize) {
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
.collect(Collectors.toList());
Collections.sort(allRatings, Comparator.nullsLast(Collections.reverseOrder()));
idcg = computeDCG(allRatings.subList(0, Math.min(ratingsInSearchHits.size(), allRatings.size())));
if (idcg != 0) {
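
For orientation, `computeDCG` (whose body is not part of this hunk) accumulates the standard exponential-gain formula, discounting each rating by the log of its rank; unrated (null) positions contribute nothing but still occupy a rank. A minimal sketch assuming the conventional definition, not the verbatim method body:

--------------------------------
// DCG = sum over ranks r of (2^rating - 1) / log2(r + 1)
private static double computeDCG(List<Integer> ratings) {
    double dcg = 0;
    int rank = 1;
    for (Integer rating : ratings) {
        if (rating != null) {
            dcg += (Math.pow(2, rating) - 1) / (Math.log(rank + 1) / Math.log(2));
        }
        rank++;
    }
    return dcg;
}
--------------------------------

For the ratings 3, 2, 3, 0, 1, 2 used in the tests below, this yields the 13.848263... value asserted as EXPECTED_DCG.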


@@ -41,19 +41,19 @@ import java.util.Objects;
public class EvalQueryQuality implements ToXContentFragment, Writeable {
private final String queryId;
private final double evaluationResult;
private final double metricScore;
private MetricDetail optionalMetricDetails;
private final List<RatedSearchHit> ratedHits;
public EvalQueryQuality(String id, double evaluationResult) {
public EvalQueryQuality(String id, double metricScore) {
this.queryId = id;
this.evaluationResult = evaluationResult;
this.metricScore = metricScore;
this.ratedHits = new ArrayList<>();
}
public EvalQueryQuality(StreamInput in) throws IOException {
this.queryId = in.readString();
this.evaluationResult = in.readDouble();
this.metricScore = in.readDouble();
this.ratedHits = in.readList(RatedSearchHit::new);
this.optionalMetricDetails = in.readOptionalNamedWriteable(MetricDetail.class);
}
@@ -61,7 +61,7 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
// only used for parsing internally
private EvalQueryQuality(String queryId, ParsedEvalQueryQuality builder) {
this.queryId = queryId;
this.evaluationResult = builder.evaluationResult;
this.metricScore = builder.evaluationResult;
this.optionalMetricDetails = builder.optionalMetricDetails;
this.ratedHits = builder.ratedHits;
}
@@ -69,7 +69,7 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(queryId);
out.writeDouble(evaluationResult);
out.writeDouble(metricScore);
out.writeList(ratedHits);
out.writeOptionalNamedWriteable(this.optionalMetricDetails);
}
@@ -78,8 +78,8 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
return queryId;
}
public double getQualityLevel() {
return evaluationResult;
public double metricScore() {
return metricScore;
}
public void setMetricDetails(MetricDetail breakdown) {
@@ -101,7 +101,7 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(queryId);
builder.field(QUALITY_LEVEL_FIELD.getPreferredName(), this.evaluationResult);
builder.field(METRIC_SCORE_FIELD.getPreferredName(), this.metricScore);
builder.startArray(UNRATED_DOCS_FIELD.getPreferredName());
for (DocumentKey key : EvaluationMetric.filterUnratedDocuments(ratedHits)) {
builder.startObject();
@@ -122,7 +122,7 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
return builder;
}
private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
static final ParseField METRIC_SCORE_FIELD = new ParseField("metric_score");
private static final ParseField UNRATED_DOCS_FIELD = new ParseField("unrated_docs");
private static final ParseField HITS_FIELD = new ParseField("hits");
private static final ParseField METRIC_DETAILS_FIELD = new ParseField("metric_details");
@@ -136,7 +136,7 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
}
static {
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, QUALITY_LEVEL_FIELD);
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, METRIC_SCORE_FIELD);
PARSER.declareObject((obj, value) -> obj.optionalMetricDetails = value, (p, c) -> parseMetricDetail(p),
METRIC_DETAILS_FIELD);
PARSER.declareObjectArray((obj, list) -> obj.ratedHits = list, (p, c) -> RatedSearchHit.parse(p), HITS_FIELD);
@@ -164,13 +164,13 @@ public class EvalQueryQuality implements ToXContentFragment, Writeable {
}
EvalQueryQuality other = (EvalQueryQuality) obj;
return Objects.equals(queryId, other.queryId) &&
Objects.equals(evaluationResult, other.evaluationResult) &&
Objects.equals(metricScore, other.metricScore) &&
Objects.equals(ratedHits, other.ratedHits) &&
Objects.equals(optionalMetricDetails, other.optionalMetricDetails);
}
@Override
public final int hashCode() {
return Objects.hash(queryId, evaluationResult, ratedHits, optionalMetricDetails);
return Objects.hash(queryId, metricScore, ratedHits, optionalMetricDetails);
}
}
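
Taken together, the renamed surface of this class can be exercised as follows; a minimal sketch using only the constructor and accessor visible in this diff:

--------------------------------
EvalQueryQuality quality = new EvalQueryQuality("my_query_id1", 0.6);
double score = quality.metricScore(); // 0.6; this accessor replaces getQualityLevel()
// toXContent now emits "metric_score": 0.6 where it previously emitted "quality_level": 0.6
--------------------------------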


@@ -39,23 +39,22 @@ import java.util.stream.Collectors;
public interface EvaluationMetric extends ToXContentObject, NamedWriteable {
/**
* Returns a single metric representing the ranking quality of a set of returned
* documents wrt. to a set of document ids labeled as relevant for this search.
* Evaluates a single ranking evaluation case.
*
* @param taskId
* the id of the query for which the ranking is currently evaluated
* an identifier of the query for which the search ranking is
* evaluated
* @param hits
* the result hits as returned by a search request
* the search result hits
* @param ratedDocs
* the documents that were ranked by human annotators for this query
* case
* @return some metric representing the quality of the result hit list wrt. to
* relevant doc ids.
* the documents that contain the document rating for this query case
* @return an {@link EvalQueryQuality} instance that contains the metric score
* with respect to the provided search hits and ratings
*/
EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);
/**
* join hits with rated documents using the joint _index/_id document key
* Joins hits with rated documents using the joint _index/_id document key.
*/
static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocument> ratedDocs) {
Map<DocumentKey, RatedDocument> ratedDocumentMap = ratedDocs.stream()
@@ -74,7 +73,7 @@ public interface EvaluationMetric extends ToXContentObject, NamedWriteable {
}
/**
* filter @link {@link RatedSearchHit} that don't have a rating
* Filter {@link RatedSearchHit}s that do not have a rating.
*/
static List<DocumentKey> filterUnratedDocuments(List<RatedSearchHit> ratedHits) {
return ratedHits.stream().filter(hit -> hit.getRating().isPresent() == false)
@@ -82,11 +81,11 @@ public interface EvaluationMetric extends ToXContentObject, NamedWriteable {
}
/**
* how evaluation metrics for particular search queries get combined for the overall evaluation score.
* Defaults to averaging over the partial results.
* Combine several {@link EvalQueryQuality} results into the overall evaluation score.
* This defaults to averaging over the partial results, but can be overwritten to obtain a different behavior.
*/
default double combine(Collection<EvalQueryQuality> partialResults) {
return partialResults.stream().mapToDouble(EvalQueryQuality::getQualityLevel).sum() / partialResults.size();
return partialResults.stream().mapToDouble(EvalQueryQuality::metricScore).sum() / partialResults.size();
}
/**
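
Because `combine` is a default method, a metric can override it to aggregate the per-query scores differently. A hypothetical sketch that reports the worst query instead of the average (the metric's other methods are omitted):

--------------------------------
@Override
public double combine(Collection<EvalQueryQuality> partialResults) {
    // pessimistic aggregation: the overall score is the weakest per-query score
    return partialResults.stream()
            .mapToDouble(EvalQueryQuality::metricScore)
            .min()
            .orElse(0.0);
}
--------------------------------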


@@ -110,8 +110,7 @@ public class MeanReciprocalRank implements EvaluationMetric {
* Compute ReciprocalRank based on provided relevant document IDs.
**/
@Override
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
List<RatedDocument> ratedDocs) {
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs) {
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
int firstRelevant = -1;
int rank = 1;
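
The remainder of the method (truncated in this hunk) walks the rated hits to find the first one rated at or above the configured relevance threshold. A rough sketch, with `relevanceThreshold` standing in for the metric's actual field:

--------------------------------
for (RatedSearchHit hit : ratedHits) {
    // a hit counts as relevant if it carries a rating at or above the threshold
    if (hit.getRating().isPresent() && hit.getRating().get() >= relevanceThreshold) {
        firstRelevant = rank;
        break;
    }
    rank++;
}
// the reciprocal rank is 1/rank of the first relevant hit, or 0 if none was found
double reciprocalRank = (firstRelevant == -1) ? 0.0 : 1.0 / firstRelevant;
--------------------------------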


@@ -48,15 +48,15 @@ import java.util.stream.Collectors;
public class RankEvalResponse extends ActionResponse implements ToXContentObject {
/** The overall evaluation result. */
private double evaluationResult;
private double metricScore;
/** details about individual ranking evaluation queries, keyed by their id */
private Map<String, EvalQueryQuality> details;
/** exceptions for specific ranking evaluation queries, keyed by their id */
private Map<String, Exception> failures;
public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> partialResults,
public RankEvalResponse(double metricScore, Map<String, EvalQueryQuality> partialResults,
Map<String, Exception> failures) {
this.evaluationResult = qualityLevel;
this.metricScore = metricScore;
this.details = new HashMap<>(partialResults);
this.failures = new HashMap<>(failures);
}
@@ -65,8 +65,8 @@ public class RankEvalResponse extends ActionResponse implements ToXContentObject
// only used in RankEvalAction#newResponse()
}
public double getEvaluationResult() {
return evaluationResult;
public double getMetricScore() {
return metricScore;
}
public Map<String, EvalQueryQuality> getPartialResults() {
@@ -85,7 +85,7 @@ public class RankEvalResponse extends ActionResponse implements ToXContentObject
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeDouble(evaluationResult);
out.writeDouble(metricScore);
out.writeVInt(details.size());
for (String queryId : details.keySet()) {
out.writeString(queryId);
@@ -101,7 +101,7 @@ public class RankEvalResponse extends ActionResponse implements ToXContentObject
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
this.evaluationResult = in.readDouble();
this.metricScore = in.readDouble();
int partialResultSize = in.readVInt();
this.details = new HashMap<>(partialResultSize);
for (int i = 0; i < partialResultSize; i++) {
@@ -120,7 +120,7 @@ public class RankEvalResponse extends ActionResponse implements ToXContentObject
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field("quality_level", evaluationResult);
builder.field("metric_score", metricScore);
builder.startObject("details");
for (String key : details.keySet()) {
details.get(key).toXContent(builder, params);
@@ -137,7 +137,6 @@ public class RankEvalResponse extends ActionResponse implements ToXContentObject
return builder;
}
private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
private static final ParseField DETAILS_FIELD = new ParseField("details");
private static final ParseField FAILURES_FIELD = new ParseField("failures");
@SuppressWarnings("unchecked")
@@ -147,7 +146,7 @@ public class RankEvalResponse extends ActionResponse implements ToXContentObject
((List<EvalQueryQuality>) a[1]).stream().collect(Collectors.toMap(EvalQueryQuality::getId, Function.identity())),
((List<Tuple<String, Exception>>) a[2]).stream().collect(Collectors.toMap(Tuple::v1, Tuple::v2))));
static {
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), QUALITY_LEVEL_FIELD);
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), EvalQueryQuality.METRIC_SCORE_FIELD);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> EvalQueryQuality.fromXContent(p, n),
DETAILS_FIELD);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> {


@@ -76,7 +76,7 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
hits[i].shard(new SearchShardTarget("testnode", new Index("index", "uuid"), 0, null));
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
/**
* Check with normalization: to get the maximal possible dcg, sort documents by
@@ -94,7 +94,7 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
* idcg = 14.595390756454922 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
}
/**
@@ -127,7 +127,7 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, rated);
assertEquals(12.779642067948913, result.getQualityLevel(), DELTA);
assertEquals(12.779642067948913, result.metricScore(), DELTA);
assertEquals(2, filterUnratedDocuments(result.getHitsAndRatings()).size());
/**
@@ -146,7 +146,7 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
* idcg = 13.347184833073591 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
}
/**
@@ -184,7 +184,7 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(12.392789260714371, result.getQualityLevel(), DELTA);
assertEquals(12.392789260714371, result.metricScore(), DELTA);
assertEquals(1, filterUnratedDocuments(result.getHitsAndRatings()).size());
/**
@@ -204,7 +204,7 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
* idcg = 13.347184833073591 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).getQualityLevel(), DELTA);
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).metricScore(), DELTA);
}
/**
@@ -223,13 +223,13 @@ public class DiscountedCumulativeGainTests extends ESTestCase {
SearchHit[] hits = new SearchHit[0];
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(0.0d, result.getQualityLevel(), DELTA);
assertEquals(0.0d, result.metricScore(), DELTA);
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());
// also check normalized
dcg = new DiscountedCumulativeGain(true, null, 10);
result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(0.0d, result.getQualityLevel(), DELTA);
assertEquals(0.0d, result.metricScore(), DELTA);
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());
}


@@ -129,7 +129,7 @@ public class EvalQueryQualityTests extends ESTestCase {
private static EvalQueryQuality mutateTestItem(EvalQueryQuality original) {
String id = original.getId();
double qualityLevel = original.getQualityLevel();
double metricScore = original.metricScore();
List<RatedSearchHit> ratedHits = new ArrayList<>(original.getHitsAndRatings());
MetricDetail metricDetails = original.getMetricDetails();
switch (randomIntBetween(0, 3)) {
@@ -137,7 +137,7 @@ public class EvalQueryQualityTests extends ESTestCase {
id = id + "_";
break;
case 1:
qualityLevel = qualityLevel + 0.1;
metricScore = metricScore + 0.1;
break;
case 2:
if (metricDetails == null) {
@@ -152,7 +152,7 @@ public class EvalQueryQualityTests extends ESTestCase {
default:
throw new IllegalStateException("The test should only allow four parameters mutated");
}
EvalQueryQuality evalQueryQuality = new EvalQueryQuality(id, qualityLevel);
EvalQueryQuality evalQueryQuality = new EvalQueryQuality(id, metricScore);
evalQueryQuality.setMetricDetails(metricDetails);
evalQueryQuality.addHitsAndRatings(ratedHits);
return evalQueryQuality;


@@ -76,10 +76,10 @@ public class ExpectedReciprocalRankTests extends ESTestCase {
Integer[] relevanceRatings = new Integer[] { 3, 2, 0, 1};
SearchHit[] hits = createSearchHits(rated, relevanceRatings);
ExpectedReciprocalRank err = new ExpectedReciprocalRank(3, 0, 3);
assertEquals(0.8984375, err.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(0.8984375, err.evaluate("id", hits, rated).metricScore(), DELTA);
// take 4th rank into window
err = new ExpectedReciprocalRank(3, 0, 4);
assertEquals(0.8984375 + 0.00244140625, err.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(0.8984375 + 0.00244140625, err.evaluate("id", hits, rated).metricScore(), DELTA);
}
/**
@@ -102,11 +102,11 @@ public class ExpectedReciprocalRankTests extends ESTestCase {
SearchHit[] hits = createSearchHits(rated, relevanceRatings);
ExpectedReciprocalRank err = new ExpectedReciprocalRank(3, null, 4);
EvalQueryQuality evaluation = err.evaluate("id", hits, rated);
assertEquals(0.875 + 0.00390625, evaluation.getQualityLevel(), DELTA);
assertEquals(0.875 + 0.00390625, evaluation.metricScore(), DELTA);
assertEquals(1, ((ExpectedReciprocalRank.Detail) evaluation.getMetricDetails()).getUnratedDocs());
// if we supply e.g. 2 as unknown docs rating, it should be the same as in the other test above
err = new ExpectedReciprocalRank(3, 2, 4);
assertEquals(0.8984375 + 0.00244140625, err.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(0.8984375 + 0.00244140625, err.evaluate("id", hits, rated).metricScore(), DELTA);
}
private SearchHit[] createSearchHits(List<RatedDocument> rated, Integer[] relevanceRatings) {
@@ -126,7 +126,7 @@ public class ExpectedReciprocalRankTests extends ESTestCase {
*/
public void testNoResults() throws Exception {
ExpectedReciprocalRank err = new ExpectedReciprocalRank(5, 0, 10);
assertEquals(0.0, err.evaluate("id", new SearchHit[0], Collections.emptyList()).getQualityLevel(), DELTA);
assertEquals(0.0, err.evaluate("id", new SearchHit[0], Collections.emptyList()).metricScore(), DELTA);
}
public void testParseFromXContent() throws IOException {
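
For orientation, the expected values in these tests follow the standard ERR definition: a document with grade g satisfies the user with probability R = (2^g - 1) / 2^maxGrade, and each rank's contribution is discounted by the probability of reaching it. A sketch of the computation for the grades 3, 2, 0, 1 with maxGrade 3:

--------------------------------
int[] grades = { 3, 2, 0, 1 };
int maxGrade = 3;
double err = 0.0;
double pReach = 1.0; // probability the user reaches the current rank
for (int rank = 1; rank <= grades.length; rank++) {
    double r = (Math.pow(2, grades[rank - 1]) - 1) / Math.pow(2, maxGrade);
    err += pReach * r / rank;
    pReach *= (1 - r);
}
// err == 0.8984375 + 0.00244140625, the value asserted above for the k = 4 window
--------------------------------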


@@ -95,14 +95,14 @@ public class MeanReciprocalRankTests extends ESTestCase {
int rankAtFirstRelevant = relevantAt + 1;
EvalQueryQuality evaluation = reciprocalRank.evaluate("id", hits, ratedDocs);
assertEquals(1.0 / rankAtFirstRelevant, evaluation.getQualityLevel(), Double.MIN_VALUE);
assertEquals(1.0 / rankAtFirstRelevant, evaluation.metricScore(), Double.MIN_VALUE);
assertEquals(rankAtFirstRelevant, ((MeanReciprocalRank.Detail) evaluation.getMetricDetails()).getFirstRelevantRank());
// check that if we have fewer search hits than relevant doc position,
// we don't find any result and get 0.0 quality level
// we don't find any result and get 0.0 score
reciprocalRank = new MeanReciprocalRank();
evaluation = reciprocalRank.evaluate("id", Arrays.copyOfRange(hits, 0, relevantAt), ratedDocs);
assertEquals(0.0, evaluation.getQualityLevel(), Double.MIN_VALUE);
assertEquals(0.0, evaluation.metricScore(), Double.MIN_VALUE);
}
public void testEvaluationOneRelevantInResults() {
@@ -120,7 +120,7 @@ public class MeanReciprocalRankTests extends ESTestCase {
}
EvalQueryQuality evaluation = reciprocalRank.evaluate("id", hits, ratedDocs);
assertEquals(1.0 / (relevantAt + 1), evaluation.getQualityLevel(), Double.MIN_VALUE);
assertEquals(1.0 / (relevantAt + 1), evaluation.metricScore(), Double.MIN_VALUE);
assertEquals(relevantAt + 1, ((MeanReciprocalRank.Detail) evaluation.getMetricDetails()).getFirstRelevantRank());
}
@@ -140,7 +140,7 @@ public class MeanReciprocalRankTests extends ESTestCase {
MeanReciprocalRank reciprocalRank = new MeanReciprocalRank(2, 10);
EvalQueryQuality evaluation = reciprocalRank.evaluate("id", hits, rated);
assertEquals((double) 1 / 3, evaluation.getQualityLevel(), 0.00001);
assertEquals((double) 1 / 3, evaluation.metricScore(), 0.00001);
assertEquals(3, ((MeanReciprocalRank.Detail) evaluation.getMetricDetails()).getFirstRelevantRank());
}
@@ -158,13 +158,13 @@ public class MeanReciprocalRankTests extends ESTestCase {
SearchHit[] hits = createSearchHits(0, 9, "test");
List<RatedDocument> ratedDocs = new ArrayList<>();
EvalQueryQuality evaluation = reciprocalRank.evaluate("id", hits, ratedDocs);
assertEquals(0.0, evaluation.getQualityLevel(), Double.MIN_VALUE);
assertEquals(0.0, evaluation.metricScore(), Double.MIN_VALUE);
}
public void testNoResults() throws Exception {
SearchHit[] hits = new SearchHit[0];
EvalQueryQuality evaluated = (new MeanReciprocalRank()).evaluate("id", hits, Collections.emptyList());
assertEquals(0.0d, evaluated.getQualityLevel(), 0.00001);
assertEquals(0.0d, evaluated.metricScore(), 0.00001);
assertEquals(-1, ((MeanReciprocalRank.Detail) evaluated.getMetricDetails()).getFirstRelevantRank());
}


@@ -53,7 +53,7 @@ public class PrecisionAtKTests extends ESTestCase {
List<RatedDocument> rated = new ArrayList<>();
rated.add(createRatedDoc("test", "0", RELEVANT_RATING_1));
EvalQueryQuality evaluated = (new PrecisionAtK()).evaluate("id", toSearchHits(rated, "test"), rated);
assertEquals(1, evaluated.getQualityLevel(), 0.00001);
assertEquals(1, evaluated.metricScore(), 0.00001);
assertEquals(1, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(1, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
@@ -66,7 +66,7 @@ public class PrecisionAtKTests extends ESTestCase {
rated.add(createRatedDoc("test", "3", RELEVANT_RATING_1));
rated.add(createRatedDoc("test", "4", IRRELEVANT_RATING_0));
EvalQueryQuality evaluated = (new PrecisionAtK()).evaluate("id", toSearchHits(rated, "test"), rated);
assertEquals((double) 4 / 5, evaluated.getQualityLevel(), 0.00001);
assertEquals((double) 4 / 5, evaluated.metricScore(), 0.00001);
assertEquals(4, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(5, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
@@ -85,7 +85,7 @@ public class PrecisionAtKTests extends ESTestCase {
rated.add(createRatedDoc("test", "4", 4));
PrecisionAtK precisionAtN = new PrecisionAtK(2, false, 5);
EvalQueryQuality evaluated = precisionAtN.evaluate("id", toSearchHits(rated, "test"), rated);
assertEquals((double) 3 / 5, evaluated.getQualityLevel(), 0.00001);
assertEquals((double) 3 / 5, evaluated.metricScore(), 0.00001);
assertEquals(3, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(5, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
@@ -99,7 +99,7 @@ public class PrecisionAtKTests extends ESTestCase {
rated.add(createRatedDoc("test", "2", IRRELEVANT_RATING_0));
// the following search hits contain only the last three documents
EvalQueryQuality evaluated = (new PrecisionAtK()).evaluate("id", toSearchHits(rated.subList(2, 5), "test"), rated);
assertEquals((double) 2 / 3, evaluated.getQualityLevel(), 0.00001);
assertEquals((double) 2 / 3, evaluated.metricScore(), 0.00001);
assertEquals(2, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(3, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
@@ -114,14 +114,14 @@ public class PrecisionAtKTests extends ESTestCase {
searchHits[2].shard(new SearchShardTarget("testnode", new Index("index", "uuid"), 0, null));
EvalQueryQuality evaluated = (new PrecisionAtK()).evaluate("id", searchHits, rated);
assertEquals((double) 2 / 3, evaluated.getQualityLevel(), 0.00001);
assertEquals((double) 2 / 3, evaluated.metricScore(), 0.00001);
assertEquals(2, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(3, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
// also try with setting `ignore_unlabeled`
PrecisionAtK prec = new PrecisionAtK(1, true, 10);
evaluated = prec.evaluate("id", searchHits, rated);
assertEquals((double) 2 / 2, evaluated.getQualityLevel(), 0.00001);
assertEquals((double) 2 / 2, evaluated.metricScore(), 0.00001);
assertEquals(2, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(2, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
@@ -133,14 +133,14 @@ public class PrecisionAtKTests extends ESTestCase {
hits[i].shard(new SearchShardTarget("testnode", new Index("index", "uuid"), 0, null));
}
EvalQueryQuality evaluated = (new PrecisionAtK()).evaluate("id", hits, Collections.emptyList());
assertEquals(0.0d, evaluated.getQualityLevel(), 0.00001);
assertEquals(0.0d, evaluated.metricScore(), 0.00001);
assertEquals(0, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(5, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
// also try with setting `ignore_unlabeled`
PrecisionAtK prec = new PrecisionAtK(1, true, 10);
evaluated = prec.evaluate("id", hits, Collections.emptyList());
assertEquals(0.0d, evaluated.getQualityLevel(), 0.00001);
assertEquals(0.0d, evaluated.metricScore(), 0.00001);
assertEquals(0, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(0, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
@@ -148,7 +148,7 @@ public class PrecisionAtKTests extends ESTestCase {
public void testNoResults() throws Exception {
SearchHit[] hits = new SearchHit[0];
EvalQueryQuality evaluated = (new PrecisionAtK()).evaluate("id", hits, Collections.emptyList());
assertEquals(0.0d, evaluated.getQualityLevel(), 0.00001);
assertEquals(0.0d, evaluated.metricScore(), 0.00001);
assertEquals(0, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRelevantRetrieved());
assertEquals(0, ((PrecisionAtK.Detail) evaluated.getMetricDetails()).getRetrieved());
}
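
Behind these assertions the metric is plain precision: relevant retrieved hits divided by retrieved hits, where `ignore_unlabeled` drops unrated hits from the denominator. A sketch of the counting, with `relevanceThreshold` and `ignoreUnlabeled` standing in for the metric's configured values:

--------------------------------
int relevantRetrieved = 0;
int retrieved = 0;
for (RatedSearchHit hit : ratedHits) {
    if (hit.getRating().isPresent()) {
        retrieved++;
        if (hit.getRating().get() >= relevanceThreshold) {
            relevantRetrieved++;
        }
    } else if (ignoreUnlabeled == false) {
        retrieved++; // unrated hits count against precision unless ignored
    }
}
double precision = retrieved == 0 ? 0.0 : (double) relevantRetrieved / retrieved;
--------------------------------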


@@ -114,7 +114,7 @@ public class RankEvalRequestIT extends ESIntegTestCase {
// the expected Prec@ for the first query is 4/6 and the expected Prec@ for the
// second is 1/6, divided by 2 to get the average
double expectedPrecision = (1.0 / 6.0 + 4.0 / 6.0) / 2.0;
assertEquals(expectedPrecision, response.getEvaluationResult(), Double.MIN_VALUE);
assertEquals(expectedPrecision, response.getMetricScore(), Double.MIN_VALUE);
Set<Entry<String, EvalQueryQuality>> entrySet = response.getPartialResults().entrySet();
assertEquals(2, entrySet.size());
for (Entry<String, EvalQueryQuality> entry : entrySet) {
@@ -157,7 +157,7 @@ public class RankEvalRequestIT extends ESIntegTestCase {
// if we look only at top 3 documents, the expected P@3 for the first query is
// 2/3 and the expected Prec@ for the second is 1/3, divided by 2 to get the average
expectedPrecision = (1.0 / 3.0 + 2.0 / 3.0) / 2.0;
assertEquals(expectedPrecision, response.getEvaluationResult(), Double.MIN_VALUE);
assertEquals(expectedPrecision, response.getMetricScore(), Double.MIN_VALUE);
}
/**
@@ -186,7 +186,7 @@ public class RankEvalRequestIT extends ESIntegTestCase {
new RankEvalRequest(task, new String[] { TEST_INDEX }));
RankEvalResponse response = client().execute(RankEvalAction.INSTANCE, builder.request()).actionGet();
assertEquals(DiscountedCumulativeGainTests.EXPECTED_DCG, response.getEvaluationResult(), 10E-14);
assertEquals(DiscountedCumulativeGainTests.EXPECTED_DCG, response.getMetricScore(), 10E-14);
// test that a different window size k affects the result
metric = new DiscountedCumulativeGain(false, null, 3);
@@ -195,7 +195,7 @@ public class RankEvalRequestIT extends ESIntegTestCase {
builder = new RankEvalRequestBuilder(client(), RankEvalAction.INSTANCE, new RankEvalRequest(task, new String[] { TEST_INDEX }));
response = client().execute(RankEvalAction.INSTANCE, builder.request()).actionGet();
assertEquals(12.39278926071437, response.getEvaluationResult(), 10E-14);
assertEquals(12.39278926071437, response.getMetricScore(), 10E-14);
}
public void testMRRRequest() {
@@ -218,7 +218,7 @@ public class RankEvalRequestIT extends ESIntegTestCase {
// the expected reciprocal rank for the berlin_query is 1/1
// dividing by 2 to get the average
double expectedMRR = (1.0 + 1.0 / 5.0) / 2.0;
assertEquals(expectedMRR, response.getEvaluationResult(), 0.0);
assertEquals(expectedMRR, response.getMetricScore(), 0.0);
// test that a different window size k affects the result
metric = new MeanReciprocalRank(1, 3);
@@ -231,7 +231,7 @@ public class RankEvalRequestIT extends ESIntegTestCase {
// the reciprocal rank for the berlin_query is 1/1
// dividing by 2 to get the average
expectedMRR = 1.0 / 2.0;
assertEquals(expectedMRR, response.getEvaluationResult(), 0.0);
assertEquals(expectedMRR, response.getMetricScore(), 0.0);
}
/**


@@ -102,7 +102,7 @@ public class RankEvalResponseTests extends ESTestCase {
try (StreamInput in = output.bytes().streamInput()) {
RankEvalResponse deserializedResponse = new RankEvalResponse();
deserializedResponse.readFrom(in);
assertEquals(randomResponse.getEvaluationResult(), deserializedResponse.getEvaluationResult(), Double.MIN_VALUE);
assertEquals(randomResponse.getMetricScore(), deserializedResponse.getMetricScore(), Double.MIN_VALUE);
assertEquals(randomResponse.getPartialResults(), deserializedResponse.getPartialResults());
assertEquals(randomResponse.getFailures().keySet(), deserializedResponse.getFailures().keySet());
assertNotSame(randomResponse, deserializedResponse);
@@ -130,7 +130,7 @@ public class RankEvalResponseTests extends ESTestCase {
assertNotSame(testItem, parsedItem);
// We cannot check equality of object here because some information (e.g.
// SearchHit#shard) cannot fully be parsed back.
assertEquals(testItem.getEvaluationResult(), parsedItem.getEvaluationResult(), 0.0);
assertEquals(testItem.getMetricScore(), parsedItem.getMetricScore(), 0.0);
assertEquals(testItem.getPartialResults().keySet(), parsedItem.getPartialResults().keySet());
for (EvalQueryQuality metricDetail : testItem.getPartialResults().values()) {
EvalQueryQuality parsedEvalQueryQuality = parsedItem.getPartialResults().get(metricDetail.getId());
@@ -154,10 +154,10 @@ public class RankEvalResponseTests extends ESTestCase {
XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
String xContent = BytesReference.bytes(response.toXContent(builder, ToXContent.EMPTY_PARAMS)).utf8ToString();
assertEquals(("{" +
" \"quality_level\": 0.123," +
" \"metric_score\": 0.123," +
" \"details\": {" +
" \"coffee_query\": {" +
" \"quality_level\": 0.1," +
" \"metric_score\": 0.1," +
" \"unrated_docs\": [{\"_index\":\"index\",\"_id\":\"456\"}]," +
" \"hits\":[{\"hit\":{\"_index\":\"index\",\"_type\":\"\",\"_id\":\"123\",\"_score\":1.0}," +
" \"rating\":5}," +


@@ -71,8 +71,8 @@ setup:
"metric" : { "precision": { "ignore_unlabeled" : true }}
}
- match: { quality_level: 1}
- match: { details.amsterdam_query.quality_level: 1.0}
- match: { metric_score: 1}
- match: { details.amsterdam_query.metric_score: 1.0}
- match: { details.amsterdam_query.unrated_docs: [ {"_index": "foo", "_id": "doc4"}]}
- match: { details.amsterdam_query.metric_details.precision: {"relevant_docs_retrieved": 2, "docs_retrieved": 2}}
@@ -84,7 +84,7 @@ setup:
- match: { details.amsterdam_query.hits.2.hit._id: "doc4"}
- is_false: details.amsterdam_query.hits.2.rating
- match: { details.berlin_query.quality_level: 1.0}
- match: { details.berlin_query.metric_score: 1.0}
- match: { details.berlin_query.unrated_docs: [ {"_index": "foo", "_id": "doc4"}]}
- match: { details.berlin_query.metric_details.precision: {"relevant_docs_retrieved": 1, "docs_retrieved": 1}}
- length: { details.berlin_query.hits: 2}
@@ -118,9 +118,9 @@ setup:
"metric" : { "precision": { "ignore_unlabeled" : true }}
}
- match: { quality_level: 1}
- match: { details.amsterdam_query.quality_level: 1.0}
- match: { details.berlin_query.quality_level: 1.0}
- match: { metric_score: 1}
- match: { details.amsterdam_query.metric_score: 1.0}
- match: { details.berlin_query.metric_score: 1.0}
---
"Mean Reciprocal Rank":
@@ -150,14 +150,14 @@ setup:
}
# average is (1/3 + 1/2)/2 = 5/12 ~ 0.41666666666666663
- gt: {quality_level: 0.416}
- lt: {quality_level: 0.417}
- gt: {details.amsterdam_query.quality_level: 0.333}
- lt: {details.amsterdam_query.quality_level: 0.334}
- gt: {metric_score: 0.416}
- lt: {metric_score: 0.417}
- gt: {details.amsterdam_query.metric_score: 0.333}
- lt: {details.amsterdam_query.metric_score: 0.334}
- match: {details.amsterdam_query.metric_details.mean_reciprocal_rank: {"first_relevant": 3}}
- match: {details.amsterdam_query.unrated_docs: [ {"_index": "foo", "_id": "doc2"},
{"_index": "foo", "_id": "doc3"} ]}
- match: {details.berlin_query.quality_level: 0.5}
- match: {details.berlin_query.metric_score: 0.5}
- match: {details.berlin_query.metric_details.mean_reciprocal_rank: {"first_relevant": 2}}
- match: {details.berlin_query.unrated_docs: [ {"_index": "foo", "_id": "doc1"}]}


@@ -69,10 +69,10 @@
"metric" : { "dcg": {}}
}
- gt: {quality_level: 13.848263 }
- lt: {quality_level: 13.848264 }
- gt: {details.dcg_query.quality_level: 13.848263}
- lt: {details.dcg_query.quality_level: 13.848264}
- gt: {metric_score: 13.848263 }
- lt: {metric_score: 13.848264 }
- gt: {details.dcg_query.metric_score: 13.848263}
- lt: {details.dcg_query.metric_score: 13.848264}
- match: {details.dcg_query.unrated_docs: [ ]}
# reverse the order in which the results are returned (less relevant docs first)
@@ -96,10 +96,10 @@
"metric" : { "dcg": { }}
}
- gt: {quality_level: 10.299674}
- lt: {quality_level: 10.299675}
- gt: {details.dcg_query_reverse.quality_level: 10.299674}
- lt: {details.dcg_query_reverse.quality_level: 10.299675}
- gt: {metric_score: 10.299674}
- lt: {metric_score: 10.299675}
- gt: {details.dcg_query_reverse.metric_score: 10.299674}
- lt: {details.dcg_query_reverse.metric_score: 10.299675}
- match: {details.dcg_query_reverse.unrated_docs: [ ]}
# if we mix both, we should get the average
@@ -134,11 +134,11 @@
"metric" : { "dcg": { }}
}
- gt: {quality_level: 12.073969}
- lt: {quality_level: 12.073970}
- gt: {details.dcg_query.quality_level: 13.848263}
- lt: {details.dcg_query.quality_level: 13.848264}
- gt: {metric_score: 12.073969}
- lt: {metric_score: 12.073970}
- gt: {details.dcg_query.metric_score: 13.848263}
- lt: {details.dcg_query.metric_score: 13.848264}
- match: {details.dcg_query.unrated_docs: [ ]}
- gt: {details.dcg_query_reverse.quality_level: 10.299674}
- lt: {details.dcg_query_reverse.quality_level: 10.299675}
- gt: {details.dcg_query_reverse.metric_score: 10.299674}
- lt: {details.dcg_query_reverse.metric_score: 10.299675}
- match: {details.dcg_query_reverse.unrated_docs: [ ]}


@@ -34,8 +34,8 @@
"metric" : { "precision": { "ignore_unlabeled" : true }}
}
- match: { quality_level: 1}
- match: { details.amsterdam_query.quality_level: 1.0}
- match: { metric_score: 1}
- match: { details.amsterdam_query.metric_score: 1.0}
- match: { details.amsterdam_query.unrated_docs: [ ]}
- match: { details.amsterdam_query.metric_details.precision: {"relevant_docs_retrieved": 1, "docs_retrieved": 1}}


@@ -84,7 +84,7 @@ setup:
"metric" : { "precision": { }}
}
- match: {quality_level: 0.9}
- match: {metric_score: 0.9}
- match: {details.amsterdam_query.unrated_docs.0._id: "6"}
---


@@ -52,5 +52,5 @@
"metric" : { "precision": { "ignore_unlabeled" : true }}
}
- match: { quality_level: 1 }
- match: { metric_score: 1 }