From fa459f88ddb08e2bf18fe8b6cdc720f6dfc79a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 27 Jul 2016 15:22:14 +0200 Subject: [PATCH] Add normalization option When switched on, compute the normalized ndcg variant. --- .../rankeval/DiscountedCumulativeGainAt.java | 183 ++++++++++++++++++ .../rankeval/DiscountedCumulativeGainAtN.java | 118 ----------- .../rankeval/RankedListQualityMetric.java | 4 +- .../DiscountedCumulativeGainAtNTests.java | 70 ------- .../DiscountedCumulativeGainAtTests.java | 121 ++++++++++++ .../index/rankeval/ReciprocalRankTests.java | 14 +- 6 files changed, 316 insertions(+), 194 deletions(-) create mode 100644 modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAt.java delete mode 100644 modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtN.java delete mode 100644 modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtNTests.java create mode 100644 modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtTests.java diff --git a/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAt.java b/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAt.java new file mode 100644 index 00000000000..2db310b185d --- /dev/null +++ b/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAt.java @@ -0,0 +1,183 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.rankeval; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.ParseFieldMatcherSupplier; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.ObjectParser; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.search.SearchHit; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class DiscountedCumulativeGainAt extends RankedListQualityMetric { + + /** rank position up to which to check results. 
*/ + private int position; + /** If set to true, the dcg will be normalized (ndcg). NOTE(review): evaluate() divides by the ideal dcg unguarded, so empty or all-zero ratings give NaN - confirm intended */ + private boolean normalize; + /** If set, this will be the rating for docs the user hasn't supplied an explicit rating for. NOTE(review): if unset, evaluate() leaves such docs out of the rating list, which moves later hits up a rank in computeDCG - confirm intended */ + private Integer unknownDocRating; + + public static final String NAME = "dcg_at_n"; + private static final double LOG2 = Math.log(2.0); + + public DiscountedCumulativeGainAt(StreamInput in) throws IOException { + position = in.readInt(); + normalize = in.readBoolean(); + unknownDocRating = in.readOptionalVInt(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeInt(position); + out.writeBoolean(normalize); + out.writeOptionalVInt(unknownDocRating); + } + + @Override + public String getWriteableName() { + return NAME; + } + + /** + * Initialises position with 10 + * */ + public DiscountedCumulativeGainAt() { + this.position = 10; + } + + /** + * @param position number of top results to check against a given set of relevant results. Must be positive (NOTE(review): setPosition does not enforce this). + */ + public DiscountedCumulativeGainAt(int position) { + if (position <= 0) { + throw new IllegalArgumentException("number of results to check needs to be positive but was " + position); + } + this.position = position; + } + + /** + * Return number of search results to check for quality metric. + */ + public int getPosition() { + return this.position; + } + + /** + * set number of search results to check for quality metric. 
+ */ + public void setPosition(int position) { + this.position = position; + } + + /** + * If set to true, the dcg will be normalized (ndcg) + */ + public void setNormalize(boolean normalize) { + this.normalize = normalize; + } + + /** + * check whether this metric computes only dcg or "normalized" ndcg + */ + public boolean getNormalize() { + return this.normalize; + } + + /** + * set the rating for docs the user hasn't supplied an explicit rating for + */ + public void setUnknownDocRating(int unknownDocRating) { + this.unknownDocRating = unknownDocRating; + } + + /** + * return the rating used for docs the user hasn't supplied an explicit rating for, or null if none was set + */ + public Integer getUnknownDocRating() { + return this.unknownDocRating; + } + + @Override + public EvalQueryQuality evaluate(SearchHit[] hits, List ratedDocs) { + Map ratedDocsById = new HashMap<>(); + for (RatedDocument doc : ratedDocs) { + ratedDocsById.put(doc.getDocID(), doc); + } + + Collection unknownDocIds = new ArrayList<>(); + List ratings = new ArrayList<>(); + for (int i = 0; (i < position && i < hits.length); i++) { + String id = hits[i].getId(); + RatedDocument ratedDoc = ratedDocsById.get(id); + if (ratedDoc != null) { + ratings.add(ratedDoc.getRating()); + } else { + unknownDocIds.add(id); + if (unknownDocRating != null) { + ratings.add(unknownDocRating); + } + } + } + double dcg = computeDCG(ratings); + + if (normalize) { + Collections.sort(ratings, Collections.reverseOrder()); + double idcg = computeDCG(ratings); + dcg = dcg / idcg; + } + return new EvalQueryQuality(dcg, unknownDocIds); + } + + private static double computeDCG(List ratings) { + int rank = 1; + double dcg = 0; + for (int rating : ratings) { + dcg += (Math.pow(2, rating) - 1) / ((Math.log(rank + 1) / LOG2)); + rank++; + } + return dcg; + } + + private static final ParseField SIZE_FIELD = new ParseField("size"); + private static final ParseField NORMALIZE_FIELD = new ParseField("normalize"); + private static final ParseField UNKNOWN_DOC_RATING_FIELD 
= new ParseField("unknown_doc_rating"); + private static final ObjectParser PARSER = + new ObjectParser<>("dcg_at", () -> new DiscountedCumulativeGainAt()); + + static { + PARSER.declareInt(DiscountedCumulativeGainAt::setPosition, SIZE_FIELD); + PARSER.declareBoolean(DiscountedCumulativeGainAt::setNormalize, NORMALIZE_FIELD); + PARSER.declareInt(DiscountedCumulativeGainAt::setUnknownDocRating, UNKNOWN_DOC_RATING_FIELD); + } + + public static DiscountedCumulativeGainAt fromXContent(XContentParser parser, ParseFieldMatcherSupplier matcher) { + return PARSER.apply(parser, matcher); + } +} diff --git a/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtN.java b/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtN.java deleted file mode 100644 index c1bed32952b..00000000000 --- a/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtN.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.elasticsearch.index.rankeval; - -import org.elasticsearch.common.ParseField; -import org.elasticsearch.common.ParseFieldMatcherSupplier; -import org.elasticsearch.common.io.stream.StreamInput; -import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.xcontent.ConstructingObjectParser; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.search.SearchHit; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class DiscountedCumulativeGainAtN extends RankedListQualityMetric { - - /** Number of results to check against a given set of relevant results. */ - private int n; - - public static final String NAME = "dcg_at_n"; - private static final double LOG2 = Math.log(2.0); - - public DiscountedCumulativeGainAtN(StreamInput in) throws IOException { - n = in.readInt(); - } - - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeInt(n); - } - - @Override - public String getWriteableName() { - return NAME; - } - - /** - * Initialises n with 10 - * */ - public DiscountedCumulativeGainAtN() { - this.n = 10; - } - - /** - * @param n number of top results to check against a given set of relevant results. Must be positive. - */ - public DiscountedCumulativeGainAtN(int n) { - if (n <= 0) { - throw new IllegalArgumentException("number of results to check needs to be positive but was " + n); - } - this.n = n; - } - - /** - * Return number of search results to check for quality. 
- */ - public int getN() { - return n; - } - - @Override - public EvalQueryQuality evaluate(SearchHit[] hits, List ratedDocs) { - Map ratedDocsById = new HashMap<>(); - for (RatedDocument doc : ratedDocs) { - ratedDocsById.put(doc.getDocID(), doc); - } - - Collection unknownDocIds = new ArrayList(); - double dcg = 0; - - for (int i = 0; (i < n && i < hits.length); i++) { - int rank = i + 1; // rank is 1-based - String id = hits[i].getId(); - RatedDocument ratedDoc = ratedDocsById.get(id); - if (ratedDoc != null) { - int rel = ratedDoc.getRating(); - dcg += (Math.pow(2, rel) - 1) / ((Math.log(rank + 1) / LOG2)); - } else { - unknownDocIds.add(id); - } - } - return new EvalQueryQuality(dcg, unknownDocIds); - } - - private static final ParseField SIZE_FIELD = new ParseField("size"); - private static final ConstructingObjectParser PARSER = - new ConstructingObjectParser<>("dcg_at", a -> new DiscountedCumulativeGainAtN((Integer) a[0])); - - static { - PARSER.declareInt(ConstructingObjectParser.constructorArg(), SIZE_FIELD); - } - - public static DiscountedCumulativeGainAtN fromXContent(XContentParser parser, ParseFieldMatcherSupplier matcher) { - return PARSER.apply(parser, matcher); - } -} diff --git a/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/RankedListQualityMetric.java b/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/RankedListQualityMetric.java index ba28b6f9bb0..c346f0d0d59 100644 --- a/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/RankedListQualityMetric.java +++ b/modules/rank-eval/src/main/java/org/elasticsearch/index/rankeval/RankedListQualityMetric.java @@ -63,8 +63,8 @@ public abstract class RankedListQualityMetric implements NamedWriteable { case ReciprocalRank.NAME: rc = ReciprocalRank.fromXContent(parser, context); break; - case DiscountedCumulativeGainAtN.NAME: - rc = DiscountedCumulativeGainAtN.fromXContent(parser, context); + case DiscountedCumulativeGainAt.NAME: + rc = 
DiscountedCumulativeGainAt.fromXContent(parser, context); break; default: throw new ParsingException(parser.getTokenLocation(), "[_na] unknown query metric name [{}]", metricName); diff --git a/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtNTests.java b/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtNTests.java deleted file mode 100644 index 2114feaa835..00000000000 --- a/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtNTests.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.elasticsearch.index.rankeval; - -import org.elasticsearch.common.ParseFieldMatcher; -import org.elasticsearch.common.text.Text; -import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.internal.InternalSearchHit; -import org.elasticsearch.test.ESTestCase; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.ExecutionException; - -public class DiscountedCumulativeGainAtNTests extends ESTestCase { - - /** - * Assuming the docs are ranked in the following order: - * - * rank | rel_rank | 2^(rel_rank) - 1 | log_2(rank + 1) | (2^(rel_rank) - 1) / log_2(rank + 1) - * ------------------------------------------------------------------------------------------- - * 1 | 3 | 7.0 | 1.0 | 7.0 - * 2 | 2 | 3.0 | 1.5849625007211563 | 1.8927892607143721 - * 3 | 3 | 7.0 | 2.0 | 3.5 - * 4 | 0 | 0.0 | 2.321928094887362 | 0.0 - * 5 | 1 | 1.0 | 2.584962500721156 | 0.38685280723454163 - * 6 | 2 | 3.0 | 2.807354922057604 | 1.0686215613240666 - */ - public void testDCGAtSix() throws IOException, InterruptedException, ExecutionException { - List rated = new ArrayList<>(); - int[] relevanceRatings = new int[] { 3, 2, 3, 0, 1, 2 }; - SearchHit[] hits = new InternalSearchHit[6]; - for (int i = 0; i < 6; i++) { - rated.add(new RatedDocument(Integer.toString(i), relevanceRatings[i])); - hits[i] = new InternalSearchHit(i, Integer.toString(i), new Text("type"), Collections.emptyMap()); - } - assertEquals(13.84826362927298d, (new DiscountedCumulativeGainAtN(6)).evaluate(hits, rated).getQualityLevel(), 0.00001); - } - - - public void testParseFromXContent() throws IOException { - String xContent = " {\n" - + " \"size\": 8\n" - + "}"; - XContentParser parser = XContentFactory.xContent(xContent).createParser(xContent); - DiscountedCumulativeGainAtN dcgAt = 
DiscountedCumulativeGainAtN.fromXContent(parser, () -> ParseFieldMatcher.STRICT); - assertEquals(8, dcgAt.getN()); - } -} diff --git a/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtTests.java b/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtTests.java new file mode 100644 index 00000000000..59b05a35ade --- /dev/null +++ b/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/DiscountedCumulativeGainAtTests.java @@ -0,0 +1,121 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.rankeval; + +import org.elasticsearch.common.ParseFieldMatcher; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.internal.InternalSearchHit; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.ExecutionException; + +public class DiscountedCumulativeGainAtTests extends ESTestCase { + + /** + * Assuming the docs are ranked in the following order: + * + * rank | rel_rank | 2^(rel_rank) - 1 | log_2(rank + 1) | (2^(rel_rank) - 1) / log_2(rank + 1) + * ------------------------------------------------------------------------------------------- + * 1 | 3 | 7.0 | 1.0 | 7.0 + * 2 | 2 | 3.0 | 1.5849625007211563 | 1.8927892607143721 + * 3 | 3 | 7.0 | 2.0 | 3.5 + * 4 | 0 | 0.0 | 2.321928094887362 | 0.0 + * 5 | 1 | 1.0 | 2.584962500721156 | 0.38685280723454163 + * 6 | 2 | 3.0 | 2.807354922057604 | 1.0686215613240666 + * + * dcg = 13.84826362927298 (sum of last column) + */ + public void testDCGAtSix() throws IOException, InterruptedException, ExecutionException { + List rated = new ArrayList<>(); + int[] relevanceRatings = new int[] { 3, 2, 3, 0, 1, 2 }; + SearchHit[] hits = new InternalSearchHit[6]; + for (int i = 0; i < 6; i++) { + rated.add(new RatedDocument(Integer.toString(i), relevanceRatings[i])); + hits[i] = new InternalSearchHit(i, Integer.toString(i), new Text("type"), Collections.emptyMap()); + } + DiscountedCumulativeGainAt dcg = new DiscountedCumulativeGainAt(6); + assertEquals(13.84826362927298, dcg.evaluate(hits, rated).getQualityLevel(), 0.00001); + + /** + * Check with normalization: to get the maximal possible dcg, sort documents by relevance in descending order + * + * rank | rel_rank | 2^(rel_rank) - 1 
| log_2(rank + 1) | (2^(rel_rank) - 1) / log_2(rank + 1) + * ------------------------------------------------------------------------------------------- + * 1 | 3 | 7.0 | 1.0 | 7.0 + * 2 | 3 | 7.0 | 1.5849625007211563 | 4.416508275000202 + * 3 | 2 | 3.0 | 2.0 | 1.5 + * 4 | 2 | 3.0 | 2.321928094887362 | 1.2920296742201793 + * 5 | 1 | 1.0 | 2.584962500721156 | 0.38685280723454163 + * 6 | 0 | 0.0 | 2.807354922057604 | 0.0 + * + * idcg = 14.595390756454922 (sum of last column) + */ + dcg.setNormalize(true); + assertEquals(13.84826362927298 / 14.595390756454922, dcg.evaluate(hits, rated).getQualityLevel(), 0.00001); + } + + /** + * This tests the metric when some documents in the search result don't have a rating provided by the user. + * + * rank | rel_rank | 2^(rel_rank) - 1 | log_2(rank + 1) | (2^(rel_rank) - 1) / log_2(rank + 1) + * ------------------------------------------------------------------------------------------- + * 1 | 3 | 7.0 | 1.0 | 7.0 + * 2 | 2 | 3.0 | 1.5849625007211563 | 1.8927892607143721 + * 3 | 3 | 7.0 | 2.0 | 3.5 + * 4 | n/a | n/a | n/a | n/a + * 5 | n/a | n/a | n/a | n/a + * 6 | n/a | n/a | n/a | n/a + * + * dcg = 12.392789260714371 (sum of last column) + */ + public void testDCGAtSixMissingRatings() throws IOException, InterruptedException, ExecutionException { + List rated = new ArrayList<>(); + int[] relevanceRatings = new int[] { 3, 2, 3}; + SearchHit[] hits = new InternalSearchHit[6]; + for (int i = 0; i < 6; i++) { + if (i < relevanceRatings.length) { + rated.add(new RatedDocument(Integer.toString(i), relevanceRatings[i])); + } + hits[i] = new InternalSearchHit(i, Integer.toString(i), new Text("type"), Collections.emptyMap()); + } + DiscountedCumulativeGainAt dcg = new DiscountedCumulativeGainAt(6); + EvalQueryQuality result = dcg.evaluate(hits, rated); + assertEquals(12.392789260714371, result.getQualityLevel(), 0.00001); + assertEquals(3, result.getUnknownDocs().size()); + } + + public void testParseFromXContent() throws IOException { 
+ String xContent = " {\n" + + " \"size\": 8,\n" + + " \"normalize\": true\n" + + "}"; + XContentParser parser = XContentFactory.xContent(xContent).createParser(xContent); + DiscountedCumulativeGainAt dcgAt = DiscountedCumulativeGainAt.fromXContent(parser, () -> ParseFieldMatcher.STRICT); + assertEquals(8, dcgAt.getPosition()); + assertEquals(true, dcgAt.getNormalize()); + } +} diff --git a/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/ReciprocalRankTests.java b/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/ReciprocalRankTests.java index 12dd808cff7..e87905cb1b4 100644 --- a/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/ReciprocalRankTests.java +++ b/modules/rank-eval/src/test/java/org/elasticsearch/index/rankeval/ReciprocalRankTests.java @@ -58,11 +58,17 @@ public class ReciprocalRankTests extends ESTestCase { int rankAtFirstRelevant = relevantAt + 1; EvalQueryQuality evaluation = reciprocalRank.evaluate(hits, ratedDocs); - assertEquals(1.0 / rankAtFirstRelevant, evaluation.getQualityLevel(), Double.MIN_VALUE); + if (rankAtFirstRelevant <= maxRank) { + assertEquals(1.0 / rankAtFirstRelevant, evaluation.getQualityLevel(), Double.MIN_VALUE); - reciprocalRank = new ReciprocalRank(rankAtFirstRelevant - 1); - evaluation = reciprocalRank.evaluate(hits, ratedDocs); - assertEquals(0.0, evaluation.getQualityLevel(), Double.MIN_VALUE); + // check that if we lower maxRank by one, we don't find any result and get 0.0 quality level + reciprocalRank = new ReciprocalRank(rankAtFirstRelevant - 1); + evaluation = reciprocalRank.evaluate(hits, ratedDocs); + assertEquals(0.0, evaluation.getQualityLevel(), Double.MIN_VALUE); + + } else { + assertEquals(0.0, evaluation.getQualityLevel(), Double.MIN_VALUE); + } } public void testEvaluationOneRelevantInResults() {