significant terms: add google normalized distance, add chi square

closes #6858
Britta Weber 2014-07-12 11:53:08 +02:00
parent 5f0719fd50
commit a3cefd919e
11 changed files with 758 additions and 301 deletions

View File

@@ -306,7 +306,45 @@ Per default, the assumption is that the documents in the bucket are also contain
"background_is_superset": false
--------------------------------------------------
===== Chi square
added[1.4.0]
Chi square, as described in "Information Retrieval", Manning et al., Chapter 13.5.2, can be used as the significance score by adding the parameter
[source,js]
--------------------------------------------------
"chi_square": {
}
--------------------------------------------------
Chi square behaves like mutual information and can be configured with the same parameters `include_negatives` and `background_is_superset`.
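For example, both parameters can be set explicitly:

[source,js]
--------------------------------------------------
"chi_square": {
    "include_negatives": true,
    "background_is_superset": false
}
--------------------------------------------------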
===== Google normalized distance
added[1.4.0]
Google normalized distance, as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (http://arxiv.org/pdf/cs/0412098v3.pdf), can be used as the significance score by adding the parameter
[source,js]
--------------------------------------------------
"gnd": {
}
--------------------------------------------------
`gnd` also accepts the `background_is_superset` parameter.
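For example:

[source,js]
--------------------------------------------------
"gnd": {
    "background_is_superset": false
}
--------------------------------------------------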
===== Which one is best?
Roughly, `mutual_information` prefers terms that are frequent overall, even if they also occur frequently in the background. For example, in an analysis of natural language text this might lead to the selection of stop words. `mutual_information` is unlikely to select very rare terms such as misspellings. `gnd` prefers terms with a high co-occurrence and avoids the selection of stop words. It might be better suited for synonym detection. However, `gnd` has a tendency to select very rare terms that are, for example, the result of misspellings. `chi_square` and `jlh` are somewhat in-between.
It is hard to say which one of the different heuristics will be the best choice, as that depends on what the significant terms are used for (see for example Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997 (http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf) for a study on using significant terms for feature selection in text classification).
===== Size & Shard Size

View File

@@ -0,0 +1,115 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
public class ChiSquare extends NXYSignificanceHeuristic {
protected static final ParseField NAMES_FIELD = new ParseField("chi_square");
public ChiSquare(boolean includeNegatives, boolean backgroundIsSuperset) {
super(includeNegatives, backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
if (!(other instanceof ChiSquare)) {
return false;
}
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAMES_FIELD.getPreferredName().hashCode();
result = 31 * result + super.hashCode();
return result;
}
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
@Override
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
return new ChiSquare(in.readBoolean(), in.readBoolean());
}
@Override
public String getName() {
return NAMES_FIELD.getPreferredName();
}
};
/**
* Calculates Chi^2
* see "Information Retrieval", Manning et al., Eq. 13.19
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "ChiSquare");
// if the term appears relatively less often in the subset than in the background without the subset, give it the lowest possible score
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
return Double.NEGATIVE_INFINITY;
}
return (frequencies.N * Math.pow((frequencies.N11 * frequencies.N00 - frequencies.N01 * frequencies.N10), 2.0) /
((frequencies.N_1) * (frequencies.N1_) * (frequencies.N0_) * (frequencies.N_0)));
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(STREAM.getName());
super.writeTo(out);
}
public static class ChiSquareParser extends NXYParser {
@Override
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new ChiSquare(includeNegatives, backgroundIsSuperset);
}
@Override
public String[] getNames() {
return NAMES_FIELD.getAllNamesIncludedDeprecated();
}
}
public static class ChiSquareBuilder extends NXYSignificanceHeuristic.NXYBuilder {
public ChiSquareBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
super(includeNegatives, backgroundIsSuperset);
}
@Override
public void toXContent(XContentBuilder builder) throws IOException {
builder.startObject(STREAM.getName());
super.build(builder);
builder.endObject();
}
}
}
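As a usage sketch (mirroring the integration tests further down in this commit), the new builder plugs into a significant terms request like this; the index name, field and client setup are placeholders, not part of the commit:

// Sketch only: assumes an index "test" with a "description" field, as in the tests below.
SearchResponse response = client().prepareSearch("test")
        .addAggregation(new SignificantTermsBuilder("mySignificantTerms")
                .field("description")
                .significanceHeuristic(new ChiSquare.ChiSquareBuilder(false, true))
                .minDocCount(2))
        .execute()
        .actionGet();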

View File

@@ -0,0 +1,148 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public class GND extends NXYSignificanceHeuristic {
protected static final ParseField NAMES_FIELD = new ParseField("gnd");
public GND(boolean backgroundIsSuperset) {
super(true, backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
if (!(other instanceof GND)) {
return false;
}
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAMES_FIELD.getPreferredName().hashCode();
result = 31 * result + super.hashCode();
return result;
}
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
@Override
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
return new GND(in.readBoolean());
}
@Override
public String getName() {
return NAMES_FIELD.getPreferredName();
}
};
/**
* Calculates Google Normalized Distance, as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007
* link: http://arxiv.org/pdf/cs/0412098v3.pdf
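* The returned score is exp(-GND(x, y)) where GND(x, y) = (max(log f(x), log f(y)) - log f(x, y)) / (log N - min(log f(x), log f(y))),
* so that closely related terms (small distance) get a high score.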
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "GND");
double fx = frequencies.N1_;
double fy = frequencies.N_1;
double fxy = frequencies.N11;
double N = frequencies.N;
if (fxy == 0) {
// no co-occurrence
return 0.0;
}
if ((fx == fy) && (fx == fxy)) {
// perfect co-occurrence
return 1.0;
}
double score = (Math.max(Math.log(fx), Math.log(fy)) - Math.log(fxy)) /
(Math.log(N) - Math.min(Math.log(fx), Math.log(fy)));
//invert the score: GND is a distance, so closely related terms score low, but the heuristic must score them high
score = Math.exp(-1.0d * score);
return score;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(STREAM.getName());
out.writeBoolean(backgroundIsSuperset);
}
public static class GNDParser extends NXYParser {
@Override
public String[] getNames() {
return NAMES_FIELD.getAllNamesIncludedDeprecated();
}
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new GND(backgroundIsSuperset);
}
@Override
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
String givenName = parser.currentName();
boolean backgroundIsSuperset = true;
XContentParser.Token token = parser.nextToken();
while (!token.equals(XContentParser.Token.END_OBJECT)) {
if (BACKGROUND_IS_SUPERSET.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
parser.nextToken();
backgroundIsSuperset = parser.booleanValue();
} else {
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for " + givenName);
}
token = parser.nextToken();
}
return newHeuristic(true, backgroundIsSuperset);
}
}
public static class GNDBuilder extends NXYBuilder {
public GNDBuilder(boolean backgroundIsSuperset) {
super(true, backgroundIsSuperset);
}
@Override
public void toXContent(XContentBuilder builder) throws IOException {
builder.startObject(STREAM.getName());
builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
builder.endObject();
}
}
}

View File

@@ -31,7 +31,7 @@ import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public class JLHScore implements SignificanceHeuristic {
public class JLHScore extends SignificanceHeuristic {
public static final JLHScore INSTANCE = new JLHScore();
@@ -59,25 +59,10 @@ public class JLHScore implements SignificanceHeuristic {
* Calculates the significance of a term in a sample against a background of
* normal distributions by comparing the changes in frequency. This is the heart
* of the significant terms feature.
* <p/>
*
* @param subsetFreq The frequency of the term in the selected sample
* @param subsetSize The size of the selected sample (typically number of docs)
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
* @return a "significance" score
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in JLHScore.getScore()");
}
if (subsetFreq > subsetSize) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)");
}
if (supersetFreq > supersetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)");
}
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "JLHScore");
if ((subsetSize == 0) || (supersetSize == 0)) {
// avoid any divide by zero issues
return 0;

View File

@@ -21,50 +21,36 @@
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public class MutualInformation implements SignificanceHeuristic {
public class MutualInformation extends NXYSignificanceHeuristic {
protected static final ParseField NAMES_FIELD = new ParseField("mutual_information");
protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");
protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");
protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";
private static final double log2 = Math.log(2.0);
/**
* Mutual information does not differentiate between terms that are descriptive for subset or for
* the background without the subset. We might want to filter out the terms that are appear much less often
* in the subset than in the background without the subset.
*/
protected boolean includeNegatives = false;
private boolean backgroundIsSuperset = true;
private MutualInformation() {};
public MutualInformation(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
super(includeNegatives, backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
if (! (other instanceof MutualInformation)) {
if (!(other instanceof MutualInformation)) {
return false;
}
return ((MutualInformation)other).includeNegatives == includeNegatives && ((MutualInformation)other).backgroundIsSuperset == backgroundIsSuperset;
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAMES_FIELD.getPreferredName().hashCode();
result = 31 * result + super.hashCode();
return result;
}
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
@@ -82,88 +68,23 @@ public class MutualInformation implements SignificanceHeuristic {
/**
* Calculates mutual information
* see "Information Retrieval", Manning et al., Eq. 13.17
*
* @param subsetFreq The frequency of the term in the selected sample
* @param subsetSize The size of the selected sample (typically number of docs)
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
* @return a "significance" score
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in MutualInformation.getScore()");
}
if (subsetFreq > subsetSize) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in MutualInformation.score(..)");
}
if (supersetFreq > supersetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in MutualInformation.score(..)");
}
if (backgroundIsSuperset) {
if (subsetFreq > supersetFreq) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
}
if (subsetSize > supersetSize) {
throw new ElasticsearchIllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
}
if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
}
}
double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
if (backgroundIsSuperset) {
//documents not in class and do not contain term
N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
//documents in class and do not contain term
N01 = (subsetSize - subsetFreq);
// documents not in class and do contain term
N10 = supersetFreq - subsetFreq;
// documents in class and do contain term
N11 = subsetFreq;
//documents that do not contain term
N0_ = supersetSize - supersetFreq;
//documents that contain term
N1_ = supersetFreq;
//documents that are not in class
N_0 = supersetSize - subsetSize;
//documents that are in class
N_1 = subsetSize;
//all docs
N = supersetSize;
} else {
//documents not in class and do not contain term
N00 = supersetSize - supersetFreq;
//documents in class and do not contain term
N01 = subsetSize - subsetFreq;
// documents not in class and do contain term
N10 = supersetFreq;
// documents in class and do contain term
N11 = subsetFreq;
//documents that do not contain term
N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
//documents that contain term
N1_ = supersetFreq + subsetFreq;
//documents that are not in class
N_0 = supersetSize;
//documents that are in class
N_1 = subsetSize;
//all docs
N = supersetSize + subsetSize;
}
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "MutualInformation");
double score = (getMITerm(N00, N0_, N_0, N) +
getMITerm(N01, N0_, N_1, N) +
getMITerm(N10, N1_, N_0, N) +
getMITerm(N11, N1_, N_1, N))
double score = (getMITerm(frequencies.N00, frequencies.N0_, frequencies.N_0, frequencies.N) +
getMITerm(frequencies.N01, frequencies.N0_, frequencies.N_1, frequencies.N) +
getMITerm(frequencies.N10, frequencies.N1_, frequencies.N_0, frequencies.N) +
getMITerm(frequencies.N11, frequencies.N1_, frequencies.N_1, frequencies.N))
/ log2;
if (Double.isNaN(score)) {
score = -1.0 * Float.MAX_VALUE;
score = Double.NEGATIVE_INFINITY;
}
// here we check if the term appears more often in subset than in background without subset.
if (!includeNegatives && N11 / N_1 < N10 / N_0) {
score = -1.0 * Double.MAX_VALUE;
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
score = Double.NEGATIVE_INFINITY;
}
return score;
}
@@ -194,43 +115,13 @@ public class MutualInformation implements SignificanceHeuristic {
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(STREAM.getName());
out.writeBoolean(includeNegatives);
out.writeBoolean(backgroundIsSuperset);
super.writeTo(out);
}
public boolean getIncludeNegatives() {
return includeNegatives;
}
@Override
public int hashCode() {
int result = (includeNegatives ? 1 : 0);
result = 31 * result + (backgroundIsSuperset ? 1 : 0);
return result;
}
public static class MutualInformationParser implements SignificanceHeuristicParser {
public static class MutualInformationParser extends NXYParser {
@Override
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
NAMES_FIELD.match(parser.currentName(), ParseField.EMPTY_FLAGS);
boolean includeNegatives = false;
boolean backgroundIsSuperset = true;
XContentParser.Token token = parser.nextToken();
while (!token.equals(XContentParser.Token.END_OBJECT)) {
if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
parser.nextToken();
includeNegatives = parser.booleanValue();
} else if (BACKGROUND_IS_SUPERSET.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
parser.nextToken();
backgroundIsSuperset = parser.booleanValue();
} else {
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for mutual_information.");
}
token = parser.nextToken();
}
// move to the closing bracket
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new MutualInformation(includeNegatives, backgroundIsSuperset);
}
@@ -240,23 +131,17 @@ public class MutualInformation implements SignificanceHeuristic {
}
}
public static class MutualInformationBuilder implements SignificanceHeuristicBuilder {
boolean includeNegatives = true;
private boolean backgroundIsSuperset = true;
private MutualInformationBuilder() {};
public static class MutualInformationBuilder extends NXYBuilder {
public MutualInformationBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
super(includeNegatives, backgroundIsSuperset);
}
@Override
public void toXContent(XContentBuilder builder) throws IOException {
builder.startObject(STREAM.getName())
.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset)
.endObject();
builder.startObject(STREAM.getName());
super.build(builder);
builder.endObject();
}
}
}

View File

@@ -0,0 +1,180 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public abstract class NXYSignificanceHeuristic extends SignificanceHeuristic {
protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");
protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");
protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";
protected final boolean backgroundIsSuperset;
/**
* Some heuristics do not differentiate between terms that are descriptive for subset or for
* the background without the subset. We might want to filter out the terms that are appear much less often
* in the subset than in the background without the subset.
*/
protected final boolean includeNegatives;
public NXYSignificanceHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeBoolean(includeNegatives);
out.writeBoolean(backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
return ((NXYSignificanceHeuristic) other).includeNegatives == includeNegatives && ((NXYSignificanceHeuristic) other).backgroundIsSuperset == backgroundIsSuperset;
}
@Override
public int hashCode() {
int result = (includeNegatives ? 1 : 0);
result = 31 * result + (backgroundIsSuperset ? 1 : 0);
return result;
}
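/**
 * Contingency table counts in the Nxy notation of Manning et al.:
 * the first index is 1 if a document contains the term (0 otherwise), the
 * second index is 1 if a document belongs to the class, i.e. the subset
 * (0 otherwise), and "_" stands for "any". For example, with
 * backgroundIsSuperset and subsetFreq=1, subsetSize=2, supersetFreq=3,
 * supersetSize=10, computeNxys below yields N11=1, N01=1, N10=2, N00=6,
 * N1_=3, N0_=7, N_1=2, N_0=8, N=10.
 */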
protected static class Frequencies {
double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
}
protected Frequencies computeNxys(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
checkFrequencies(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
Frequencies frequencies = new Frequencies();
if (backgroundIsSuperset) {
//documents not in class and do not contain term
frequencies.N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
//documents in class and do not contain term
frequencies.N01 = (subsetSize - subsetFreq);
// documents not in class and do contain term
frequencies.N10 = supersetFreq - subsetFreq;
// documents in class and do contain term
frequencies.N11 = subsetFreq;
//documents that do not contain term
frequencies.N0_ = supersetSize - supersetFreq;
//documents that contain term
frequencies.N1_ = supersetFreq;
//documents that are not in class
frequencies.N_0 = supersetSize - subsetSize;
//documents that are in class
frequencies.N_1 = subsetSize;
//all docs
frequencies.N = supersetSize;
} else {
//documents not in class and do not contain term
frequencies.N00 = supersetSize - supersetFreq;
//documents in class and do not contain term
frequencies.N01 = subsetSize - subsetFreq;
// documents not in class and do contain term
frequencies.N10 = supersetFreq;
// documents in class and do contain term
frequencies.N11 = subsetFreq;
//documents that do not contain term
frequencies.N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
//documents that contain term
frequencies.N1_ = supersetFreq + subsetFreq;
//documents that are not in class
frequencies.N_0 = supersetSize;
//documents that are in class
frequencies.N_1 = subsetSize;
//all docs
frequencies.N = supersetSize + subsetSize;
}
return frequencies;
}
protected void checkFrequencies(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
if (backgroundIsSuperset) {
if (subsetFreq > supersetFreq) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
}
if (subsetSize > supersetSize) {
throw new ElasticsearchIllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
}
if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
}
}
}
public static abstract class NXYParser implements SignificanceHeuristicParser {
@Override
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
String givenName = parser.currentName();
boolean includeNegatives = false;
boolean backgroundIsSuperset = true;
XContentParser.Token token = parser.nextToken();
while (!token.equals(XContentParser.Token.END_OBJECT)) {
if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName())) {
parser.nextToken();
includeNegatives = parser.booleanValue();
} else if (BACKGROUND_IS_SUPERSET.match(parser.currentName())) {
parser.nextToken();
backgroundIsSuperset = parser.booleanValue();
} else {
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for " + givenName);
}
token = parser.nextToken();
}
return newHeuristic(includeNegatives, backgroundIsSuperset);
}
protected abstract SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset);
}
protected abstract static class NXYBuilder implements SignificanceHeuristicBuilder {
protected boolean includeNegatives = true;
protected boolean backgroundIsSuperset = true;
public NXYBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
}
protected void build(XContentBuilder builder) throws IOException {
builder.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
}
}
}

View File

@@ -20,13 +20,32 @@
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.io.stream.StreamOutput;
import java.io.IOException;
public interface SignificanceHeuristic {
public abstract class SignificanceHeuristic {
/**
* @param subsetFreq The frequency of the term in the selected sample
* @param subsetSize The size of the selected sample (typically number of docs)
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
* @return a "significance" score
*/
public abstract double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize);
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize);
abstract public void writeTo(StreamOutput out) throws IOException;
void writeTo(StreamOutput out) throws IOException;
protected void checkFrequencyValidity(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()");
}
if (subsetFreq > subsetSize) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in " + scoreFunctionName + ".getScore()");
}
if (supersetFreq > supersetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in " + scoreFunctionName + ".getScore()");
}
}
}
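Since `SignificanceHeuristic` is now an abstract class rather than an interface, a custom heuristic extends it and inherits `checkFrequencyValidity`. A minimal hypothetical sketch (the `Percentage` heuristic and its scoring are illustrative only, not part of this commit):

import org.elasticsearch.common.io.stream.StreamOutput;

import java.io.IOException;

// Hypothetical heuristic: scores a term by the share of its occurrences that fall into the subset.
public class Percentage extends SignificanceHeuristic {

    @Override
    public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
        // reuse the inherited sanity checks before scoring
        checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "Percentage");
        return supersetFreq == 0 ? 0.0 : (double) subsetFreq / supersetFreq;
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeString("percentage");
    }
}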

View File

@@ -35,6 +35,8 @@ public class SignificantTermsHeuristicModule extends AbstractModule {
public SignificantTermsHeuristicModule() {
registerHeuristic(JLHScore.JLHScoreParser.class, JLHScore.STREAM);
registerHeuristic(MutualInformation.MutualInformationParser.class, MutualInformation.STREAM);
registerHeuristic(GND.GNDParser.class, GND.STREAM);
registerHeuristic(ChiSquare.ChiSquareParser.class, ChiSquare.STREAM);
}
public void registerHeuristic(Class<? extends SignificanceHeuristicParser> parser, SignificanceHeuristicStreams.Stream stream) {

View File

@@ -163,7 +163,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
}
}
public static class SimpleHeuristic implements SignificanceHeuristic {
public static class SimpleHeuristic extends SignificanceHeuristic {
protected static final String[] NAMES = {"simple"};
@@ -259,15 +259,22 @@
}
// compute significance score by
// 1. terms agg on class and significant terms
// 2. filter buckets and set the background to the other class and set is_background false
// both should yield exact same result
@Test
public void testBackgroundVsSeparateSet() throws Exception {
String type = randomBoolean() ? "string" : "long";
String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
index01Docs(type, settings);
testBackgroundVsSeparateSet(new MutualInformation.MutualInformationBuilder(true, true), new MutualInformation.MutualInformationBuilder(true, false));
testBackgroundVsSeparateSet(new ChiSquare.ChiSquareBuilder(true, true), new ChiSquare.ChiSquareBuilder(true, false));
testBackgroundVsSeparateSet(new GND.GNDBuilder(true), new GND.GNDBuilder(false));
}
// compute significance score by
// 1. terms agg on class and significant terms
// 2. filter buckets and set the background to the other class and set is_background false
// both should yield exact same result
public void testBackgroundVsSeparateSet(SignificanceHeuristicBuilder significanceHeuristicExpectingSuperset, SignificanceHeuristicBuilder significanceHeuristicExpectingSeparateSets) throws Exception {
SearchResponse response1 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
.addAggregation(new TermsBuilder("class")
.field(CLASS_FIELD)
@@ -276,7 +283,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
.field(TEXT_FIELD)
.minDocCount(1)
.significanceHeuristic(
new MutualInformation.MutualInformationBuilder(true, true))))
significanceHeuristicExpectingSuperset)))
.execute()
.actionGet();
assertSearchResponse(response1);
@@ -287,14 +294,14 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
.field(TEXT_FIELD)
.minDocCount(1)
.backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
.significanceHeuristic(significanceHeuristicExpectingSeparateSets)))
.addAggregation((new FilterAggregationBuilder("1"))
.filter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
.subAggregation(new SignificantTermsBuilder("sig_terms")
.field(TEXT_FIELD)
.minDocCount(1)
.backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "0"))
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
.significanceHeuristic(significanceHeuristicExpectingSeparateSets)))
.execute()
.actionGet();
@@ -302,7 +309,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
assertThat(sigTerms0.getBuckets().size(), equalTo(2));
double score00Background = sigTerms0.getBucketByKey("0").getSignificanceScore();
double score01Background = sigTerms0.getBucketByKey("1").getSignificanceScore();
SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("0").getAggregations().asMap().get("sig_terms")));
SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("1").getAggregations().asMap().get("sig_terms")));
double score10Background = sigTerms1.getBucketByKey("0").getSignificanceScore();
double score11Background = sigTerms1.getBucketByKey("1").getSignificanceScore();
@@ -340,14 +347,20 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
}
@Test
public void testMutualInformationEqual() throws Exception {
public void testScoresEqualForPositiveAndNegative() throws Exception {
indexEqualTestData();
//now, check that results for both classes are the same with exclude negatives = false and classes are routing ids
testScoresEqualForPositiveAndNegative(new MutualInformation.MutualInformationBuilder(true, true));
testScoresEqualForPositiveAndNegative(new ChiSquare.ChiSquareBuilder(true, true));
}
public void testScoresEqualForPositiveAndNegative(SignificanceHeuristicBuilder heuristic) throws Exception {
//check that results for both classes are the same with exclude negatives = false and classes are routing ids
SearchResponse response = client().prepareSearch("test")
.addAggregation(new TermsBuilder("class").field("class").subAggregation(new SignificantTermsBuilder("mySignificantTerms")
.field("text")
.executionHint(randomExecutionHint())
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, true))
.significanceHeuristic(heuristic)
.minDocCount(1).shardSize(1000).size(1000)))
.execute()
.actionGet();

View File

@ -29,6 +29,8 @@ import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
@@ -186,8 +188,38 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
}
@Test
public void textAnalysisGND() throws Exception {
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "terje"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new GND.GNDBuilder(true))
.minDocCount(2))
.execute()
.actionGet();
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
@Test
public void textAnalysisChiSquare() throws Exception {
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "terje"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new ChiSquare.ChiSquareBuilder(false,true))
.minDocCount(2))
.execute()
.actionGet();
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
@Test
public void badFilteredAnalysis() throws Exception {
// Deliberately using a bad choice of filter here for the background context in order

View File

@@ -39,8 +39,10 @@ import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.hamcrest.Matchers.*;
@@ -66,6 +68,8 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
public void streamResponse() throws Exception {
SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName());
SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName());
SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName());
SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName());
Version version = ElasticsearchIntegrationTest.randomVersion();
InternalSignificantTerms[] sigTerms = getRandomSignificantTerms(getRandomSignificanceheuristic());
@@ -109,11 +113,12 @@
}
SignificanceHeuristic getRandomSignificanceheuristic() {
if (randomBoolean()) {
return JLHScore.INSTANCE;
} else {
return new MutualInformation(randomBoolean(), true);
}
List<SignificanceHeuristic> heuristics = new ArrayList<>();
heuristics.add(JLHScore.INSTANCE);
heuristics.add(new MutualInformation(randomBoolean(), randomBoolean()));
heuristics.add(new GND(randomBoolean()));
heuristics.add(new ChiSquare(randomBoolean(), randomBoolean()));
return heuristics.get(randomInt(3));
}
// test that
@@ -125,110 +130,111 @@
Set<SignificanceHeuristicParser> parsers = new HashSet<>();
parsers.add(new JLHScore.JLHScoreParser());
parsers.add(new MutualInformation.MutualInformationParser());
parsers.add(new GND.GNDParser());
parsers.add(new ChiSquare.ChiSquareParser());
SignificanceHeuristicParserMapper heuristicParserMapper = new SignificanceHeuristicParserMapper(parsers);
SearchContext searchContext = new SignificantTermsTestSearchContext();
// test default with string
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"jlh\":{}, \"min_doc_count\":200}");
// test jlh with string
assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"jlh\":{}") instanceof JLHScore);
// test gnd with string
assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"gnd\":{}") instanceof GND);
// test mutual information with string
boolean includeNegatives = randomBoolean();
boolean backgroundIsSuperset = randomBoolean();
assertThat(parseFromString(heuristicParserMapper, searchContext, "\"mutual_information\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new MutualInformation(includeNegatives, backgroundIsSuperset))));
assertThat(parseFromString(heuristicParserMapper, searchContext, "\"chi_square\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new ChiSquare(includeNegatives, backgroundIsSuperset))));
// test with builders
assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new JLHScore.JLHScoreBuilder()) instanceof JLHScore);
assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new GND.GNDBuilder(backgroundIsSuperset)) instanceof GND);
assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new MutualInformation.MutualInformationBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new MutualInformation(includeNegatives, backgroundIsSuperset)));
assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new ChiSquare.ChiSquareBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new ChiSquare(includeNegatives, backgroundIsSuperset)));
// test exceptions
String faultyHeuristicdefinition = "\"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}";
String expectedError = "unknown for mutual_information";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
faultyHeuristicdefinition = "\"chi_square\":{\"unknown_field\": true}";
expectedError = "unknown for chi_square";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
faultyHeuristicdefinition = "\"jlh\":{\"unknown_field\": true}";
expectedError = "expected }, got ";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
faultyHeuristicdefinition = "\"gnd\":{\"unknown_field\": true}";
expectedError = "unknown for gnd";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
}
protected void checkParseException(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String faultyHeuristicDefinition, String expectedError) throws IOException {
try {
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + faultyHeuristicDefinition + ",\"min_doc_count\":200}");
stParser.nextToken();
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
fail();
} catch (ElasticsearchParseException e) {
assertTrue(e.getMessage().contains(expectedError));
}
}
protected SignificanceHeuristic parseFromBuilder(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, SignificanceHeuristicBuilder significanceHeuristicBuilder) throws IOException {
SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg");
stBuilder.significanceHeuristic(significanceHeuristicBuilder).field("text").minDocCount(200);
XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder();
stBuilder.internalXContent(stXContentBuilder, null);
XContentParser stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser);
}
private SignificanceHeuristic parseSignificanceHeuristic(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, XContentParser stParser) throws IOException {
stParser.nextToken();
SignificantTermsAggregatorFactory aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test default with builders
SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg");
stBuilder.significanceHeuristic(new JLHScore.JLHScoreBuilder()).field("text").minDocCount(200);
XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder();
stBuilder.internalXContent(stXContentBuilder, null);
stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
stParser.nextToken();
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test mutual_information with string
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"mutual_information\":{\"include_negatives\": false}, \"min_doc_count\":200}");
stParser.nextToken();
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertTrue(!((MutualInformation) aggregatorFactory.getSignificanceHeuristic()).getIncludeNegatives());
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test mutual_information with builders
stBuilder = new SignificantTermsBuilder("testagg");
stBuilder.significanceHeuristic(new MutualInformation.MutualInformationBuilder(false, true)).field("text").minDocCount(200);
stXContentBuilder = XContentFactory.jsonBuilder();
stBuilder.internalXContent(stXContentBuilder, null);
stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
stParser.nextToken();
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertTrue(!((MutualInformation) aggregatorFactory.getSignificanceHeuristic()).getIncludeNegatives());
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test exceptions
try {
// 1. invalid field
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}\"min_doc_count\":200}");
stParser.nextToken();
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
fail();
} catch (ElasticsearchParseException e) {
assertTrue(e.getMessage().contains("unknown for mutual_information"));
}
try {
// 2. unknown field in jlh_score
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"jlh\":{\"unknown_field\": true}, \"min_doc_count\":200}");
stParser.nextToken();
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
fail();
} catch (ElasticsearchParseException e) {
assertTrue(e.getMessage().contains("expected }, got "));
}
return aggregatorFactory.getSignificanceHeuristic();
}
@Test
public void testAssertions() throws Exception {
MutualInformation mutualInformation = new MutualInformation(true, true);
protected SignificanceHeuristic parseFromString(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String heuristicString) throws IOException {
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + heuristicString + ", \"min_doc_count\":200}");
return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser);
}
void testBackgroundAssertions(SignificanceHeuristic heuristicIsSuperset, SignificanceHeuristic heuristicNotSuperset) {
try {
mutualInformation.getScore(2, 3, 1, 4);
heuristicIsSuperset.getScore(2, 3, 1, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > supersetFreq"));
}
try {
mutualInformation.getScore(1, 4, 2, 3);
heuristicIsSuperset.getScore(1, 4, 2, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetSize > supersetSize"));
}
try {
mutualInformation.getScore(2, 1, 3, 4);
heuristicIsSuperset.getScore(2, 1, 3, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize"));
}
try {
mutualInformation.getScore(1, 2, 4, 3);
heuristicIsSuperset.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
try {
mutualInformation.getScore(1, 3, 4, 4);
heuristicIsSuperset.getScore(1, 3, 4, 4);
fail();
} catch (ElasticsearchIllegalArgumentException assertionError) {
assertNotNull(assertionError.getMessage());
@@ -238,70 +244,58 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
mutualInformation.getScore(values[0], values[1], values[2], values[3]);
heuristicIsSuperset.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
mutualInformation = new MutualInformation(true, false);
double score = mutualInformation.getScore(2, 3, 1, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
score = mutualInformation.getScore(1, 4, 2, 3);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
try {
mutualInformation.getScore(2, 1, 3, 4);
heuristicNotSuperset.getScore(2, 1, 3, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize"));
}
try {
mutualInformation.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
score = mutualInformation.getScore(1, 3, 4, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
try {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
mutualInformation.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
JLHScore jlhScore = JLHScore.INSTANCE;
try {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
jlhScore.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
try {
jlhScore.getScore(1, 2, 4, 3);
heuristicNotSuperset.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
try {
jlhScore.getScore(2, 1, 3, 4);
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
heuristicNotSuperset.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
}
void testAssertions(SignificanceHeuristic heuristic) {
try {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
heuristic.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
try {
heuristic.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
try {
heuristic.getScore(2, 1, 3, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
@@ -310,11 +304,30 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
}
@Test
public void scoreDefault() {
SignificanceHeuristic heuristic = JLHScore.INSTANCE;
public void testAssertions() throws Exception {
testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false));
testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false));
testBackgroundAssertions(new GND(true), new GND(false));
testAssertions(JLHScore.INSTANCE);
}
@Test
public void basicScoreProperties() {
basicScoreProperties(JLHScore.INSTANCE, true);
basicScoreProperties(new GND(true), true);
basicScoreProperties(new MutualInformation(true, true), false);
basicScoreProperties(new ChiSquare(true, true), false);
}
public void basicScoreProperties(SignificanceHeuristic heuristic, boolean test0) {
assertThat(heuristic.getScore(1, 1, 1, 3), greaterThan(0.0));
assertThat(heuristic.getScore(1, 1, 2, 3), lessThan(heuristic.getScore(1, 1, 1, 3)));
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0));
assertThat(heuristic.getScore(1, 1, 3, 4), lessThan(heuristic.getScore(1, 1, 2, 4)));
if (test0) {
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0));
}
double score = 0.0;
try {
long a = randomLong();
@@ -350,7 +363,34 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
assertThat(score, lessThanOrEqualTo(1.0));
assertThat(score, greaterThanOrEqualTo(0.0));
heuristic = new MutualInformation(false, true);
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(-1.0 * Double.MAX_VALUE));
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(Double.NEGATIVE_INFINITY));
heuristic = new MutualInformation(true, false);
score = heuristic.getScore(2, 3, 1, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
score = heuristic.getScore(1, 4, 2, 3);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
score = heuristic.getScore(1, 3, 4, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
}
@Test
public void testGNDCornerCases() throws Exception {
GND gnd = new GND(true);
//term is only in the subset, not at all in the other set but that is because the other set is empty.
// this should actually not happen because only terms that are in the subset are considered now,
// however, in this case the score should be 0 because a term that does not exist cannot be relevant...
assertThat(gnd.getScore(0, randomIntBetween(1, 2), 0, randomIntBetween(2,3)), equalTo(0.0));
// the terms do not co-occur at all - should be 0
assertThat(gnd.getScore(0, randomIntBetween(1, 2), randomIntBetween(2, 3), randomIntBetween(5,6)), equalTo(0.0));
// comparison between two terms that do not exist - probably not relevant
assertThat(gnd.getScore(0, 0, 0, randomIntBetween(1,2)), equalTo(0.0));
// terms co-occur perfectly - should be 1
assertThat(gnd.getScore(1, 1, 1, 1), equalTo(1.0));
gnd = new GND(false);
assertThat(gnd.getScore(0, 0, 0, 0), equalTo(0.0));
}
}