significant terms: add google normalized distance, add chi square

closes #6858
Britta Weber 2014-07-12 11:53:08 +02:00
parent 5f0719fd50
commit a3cefd919e
11 changed files with 758 additions and 301 deletions

View File

@@ -306,7 +306,45 @@ Per default, the assumption is that the documents in the bucket are also contain
"background_is_superset": false
--------------------------------------------------
===== Chi square
added[1.4.0]
Chi square, as described in "Information Retrieval", Manning et al., Chapter 13.5.2, can be used as the significance score by adding the parameter
[source,js]
--------------------------------------------------
"chi_square": {
}
--------------------------------------------------
Chi square behaves like mutual information and can be configured with the same parameters `include_negatives` and `background_is_superset`.
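For example, both parameters can be set explicitly:

[source,js]
--------------------------------------------------
"chi_square": {
    "include_negatives": true,
    "background_is_superset": false
}
--------------------------------------------------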
===== Google normalized distance
added[1.4.0]
Google normalized distance, as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (http://arxiv.org/pdf/cs/0412098v3.pdf), can be used as the significance score by adding the parameter
[source,js]
--------------------------------------------------
"gnd": {
}
--------------------------------------------------
`gnd` also accepts the `background_is_superset` parameter.
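For example:

[source,js]
--------------------------------------------------
"gnd": {
    "background_is_superset": false
}
--------------------------------------------------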
===== Which one is best?
Roughly, `mutual_information` prefers terms that are frequent overall, even if they also occur frequently in the background. For example, in an analysis of natural language text this might lead to the selection of stop words. `mutual_information` is unlikely to select very rare terms such as misspellings. `gnd` prefers terms with a high co-occurrence and avoids the selection of stop words. It might be better suited for synonym detection. However, `gnd` has a tendency to select very rare terms that are, for example, the result of misspellings. `chi_square` and `jlh` are somewhat in-between.
It is hard to say which one of the different heuristics will be the best choice, as that depends on what the significant terms are used for (see for example Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997 (http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf) for a study on using significant terms for feature selection in text classification).
===== Size & Shard Size

View File

@@ -0,0 +1,115 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
public class ChiSquare extends NXYSignificanceHeuristic {
protected static final ParseField NAMES_FIELD = new ParseField("chi_square");
public ChiSquare(boolean includeNegatives, boolean backgroundIsSuperset) {
super(includeNegatives, backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
if (!(other instanceof ChiSquare)) {
return false;
}
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAMES_FIELD.getPreferredName().hashCode();
result = 31 * result + super.hashCode();
return result;
}
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
@Override
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
return new ChiSquare(in.readBoolean(), in.readBoolean());
}
@Override
public String getName() {
return NAMES_FIELD.getPreferredName();
}
};
/**
* Calculates Chi^2
* see "Information Retrieval", Manning et al., Eq. 13.19
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "ChiSquare");
// if the term appears relatively less often in the subset than in the background without the subset, give it the lowest possible score
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
return Double.NEGATIVE_INFINITY;
}
return (frequencies.N * Math.pow((frequencies.N11 * frequencies.N00 - frequencies.N01 * frequencies.N10), 2.0) /
((frequencies.N_1) * (frequencies.N1_) * (frequencies.N0_) * (frequencies.N_0)));
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(STREAM.getName());
super.writeTo(out);
}
public static class ChiSquareParser extends NXYParser {
@Override
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new ChiSquare(includeNegatives, backgroundIsSuperset);
}
@Override
public String[] getNames() {
return NAMES_FIELD.getAllNamesIncludedDeprecated();
}
}
public static class ChiSquareBuilder extends NXYSignificanceHeuristic.NXYBuilder {
public ChiSquareBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
super(includeNegatives, backgroundIsSuperset);
}
@Override
public void toXContent(XContentBuilder builder) throws IOException {
builder.startObject(STREAM.getName());
super.build(builder);
builder.endObject();
}
}
}
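As a usage sketch (mirroring the integration tests further down in this commit), the new builder plugs into a significant terms request like this; the index name, field and client setup are placeholders, not part of the commit:

// Sketch only: assumes an index "test" with a "description" field, as in the tests below.
SearchResponse response = client().prepareSearch("test")
        .addAggregation(new SignificantTermsBuilder("mySignificantTerms")
                .field("description")
                .significanceHeuristic(new ChiSquare.ChiSquareBuilder(false, true))
                .minDocCount(2))
        .execute()
        .actionGet();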

View File

@@ -0,0 +1,148 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public class GND extends NXYSignificanceHeuristic {
protected static final ParseField NAMES_FIELD = new ParseField("gnd");
public GND(boolean backgroundIsSuperset) {
super(true, backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
if (!(other instanceof GND)) {
return false;
}
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAMES_FIELD.getPreferredName().hashCode();
result = 31 * result + super.hashCode();
return result;
}
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
@Override
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
return new GND(in.readBoolean());
}
@Override
public String getName() {
return NAMES_FIELD.getPreferredName();
}
};
/**
* Calculates Google Normalized Distance, as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007
* link: http://arxiv.org/pdf/cs/0412098v3.pdf
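* The returned score is exp(-GND(x, y)) where GND(x, y) = (max(log f(x), log f(y)) - log f(x, y)) / (log N - min(log f(x), log f(y))),
* so that closely related terms (small distance) get a high score.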
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "GND");
double fx = frequencies.N1_;
double fy = frequencies.N_1;
double fxy = frequencies.N11;
double N = frequencies.N;
if (fxy == 0) {
// no co-occurrence
return 0.0;
}
if ((fx == fy) && (fx == fxy)) {
// perfect co-occurrence
return 1.0;
}
double score = (Math.max(Math.log(fx), Math.log(fy)) - Math.log(fxy)) /
(Math.log(N) - Math.min(Math.log(fx), Math.log(fy)));
//invert the score: GND is a distance, so closely related terms score low, but the heuristic must score them high
score = Math.exp(-1.0d * score);
return score;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(STREAM.getName());
out.writeBoolean(backgroundIsSuperset);
}
public static class GNDParser extends NXYParser {
@Override
public String[] getNames() {
return NAMES_FIELD.getAllNamesIncludedDeprecated();
}
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new GND(backgroundIsSuperset);
}
@Override
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
String givenName = parser.currentName();
boolean backgroundIsSuperset = true;
XContentParser.Token token = parser.nextToken();
while (!token.equals(XContentParser.Token.END_OBJECT)) {
if (BACKGROUND_IS_SUPERSET.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
parser.nextToken();
backgroundIsSuperset = parser.booleanValue();
} else {
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for " + givenName);
}
token = parser.nextToken();
}
return newHeuristic(true, backgroundIsSuperset);
}
}
public static class GNDBuilder extends NXYBuilder {
public GNDBuilder(boolean backgroundIsSuperset) {
super(true, backgroundIsSuperset);
}
@Override
public void toXContent(XContentBuilder builder) throws IOException {
builder.startObject(STREAM.getName());
builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
builder.endObject();
}
}
}

View File

@@ -31,7 +31,7 @@ import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public class JLHScore implements SignificanceHeuristic {
public class JLHScore extends SignificanceHeuristic {
public static final JLHScore INSTANCE = new JLHScore();
@@ -59,25 +59,10 @@ public class JLHScore implements SignificanceHeuristic {
* Calculates the significance of a term in a sample against a background of
* normal distributions by comparing the changes in frequency. This is the heart
* of the significant terms feature.
* <p/>
*
* @param subsetFreq The frequency of the term in the selected sample
* @param subsetSize The size of the selected sample (typically number of docs)
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
* @return a "significance" score
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in JLHScore.getScore()");
}
if (subsetFreq > subsetSize) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)");
}
if (supersetFreq > supersetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)");
}
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "JLHScore");
if ((subsetSize == 0) || (supersetSize == 0)) {
// avoid any divide by zero issues
return 0;

View File

@@ -21,50 +21,36 @@
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public class MutualInformation implements SignificanceHeuristic {
public class MutualInformation extends NXYSignificanceHeuristic {
protected static final ParseField NAMES_FIELD = new ParseField("mutual_information");
protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");
protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");
protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";
private static final double log2 = Math.log(2.0);
/**
* Mutual information does not differentiate between terms that are descriptive for subset or for
* the background without the subset. We might want to filter out the terms that are appear much less often
* in the subset than in the background without the subset.
*/
protected boolean includeNegatives = false;
private boolean backgroundIsSuperset = true;
private MutualInformation() {};
public MutualInformation(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
super(includeNegatives, backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
if (! (other instanceof MutualInformation)) {
if (!(other instanceof MutualInformation)) {
return false;
}
return ((MutualInformation)other).includeNegatives == includeNegatives && ((MutualInformation)other).backgroundIsSuperset == backgroundIsSuperset;
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAMES_FIELD.getPreferredName().hashCode();
result = 31 * result + super.hashCode();
return result;
}
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
@@ -82,88 +68,23 @@ public class MutualInformation implements SignificanceHeuristic {
/**
* Calculates mutual information
* see "Information Retrieval", Manning et al., Eq. 13.17
*
* @param subsetFreq The frequency of the term in the selected sample
* @param subsetSize The size of the selected sample (typically number of docs)
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
* @return a "significance" score
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in MutualInformation.getScore()");
}
if (subsetFreq > subsetSize) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in MutualInformation.score(..)");
}
if (supersetFreq > supersetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in MutualInformation.score(..)");
}
if (backgroundIsSuperset) {
if (subsetFreq > supersetFreq) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
}
if (subsetSize > supersetSize) {
throw new ElasticsearchIllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
}
if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
}
}
double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
if (backgroundIsSuperset) {
//documents not in class and do not contain term
N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
//documents in class and do not contain term
N01 = (subsetSize - subsetFreq);
// documents not in class and do contain term
N10 = supersetFreq - subsetFreq;
// documents in class and do contain term
N11 = subsetFreq;
//documents that do not contain term
N0_ = supersetSize - supersetFreq;
//documents that contain term
N1_ = supersetFreq;
//documents that are not in class
N_0 = supersetSize - subsetSize;
//documents that are in class
N_1 = subsetSize;
//all docs
N = supersetSize;
} else {
//documents not in class and do not contain term
N00 = supersetSize - supersetFreq;
//documents in class and do not contain term
N01 = subsetSize - subsetFreq;
// documents not in class and do contain term
N10 = supersetFreq;
// documents in class and do contain term
N11 = subsetFreq;
//documents that do not contain term
N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
//documents that contain term
N1_ = supersetFreq + subsetFreq;
//documents that are not in class
N_0 = supersetSize;
//documents that are in class
N_1 = subsetSize;
//all docs
N = supersetSize + subsetSize;
}
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "MutualInformation");
double score = (getMITerm(N00, N0_, N_0, N) +
getMITerm(N01, N0_, N_1, N) +
getMITerm(N10, N1_, N_0, N) +
getMITerm(N11, N1_, N_1, N))
double score = (getMITerm(frequencies.N00, frequencies.N0_, frequencies.N_0, frequencies.N) +
getMITerm(frequencies.N01, frequencies.N0_, frequencies.N_1, frequencies.N) +
getMITerm(frequencies.N10, frequencies.N1_, frequencies.N_0, frequencies.N) +
getMITerm(frequencies.N11, frequencies.N1_, frequencies.N_1, frequencies.N))
/ log2;
if (Double.isNaN(score)) {
score = -1.0 * Float.MAX_VALUE;
score = Double.NEGATIVE_INFINITY;
}
// here we check if the term appears more often in subset than in background without subset.
if (!includeNegatives && N11 / N_1 < N10 / N_0) {
score = -1.0 * Double.MAX_VALUE;
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
score = Double.NEGATIVE_INFINITY;
}
return score;
}
@@ -194,43 +115,13 @@ public class MutualInformation implements SignificanceHeuristic {
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(STREAM.getName());
out.writeBoolean(includeNegatives);
out.writeBoolean(backgroundIsSuperset);
super.writeTo(out);
}
public boolean getIncludeNegatives() {
return includeNegatives;
}
@Override
public int hashCode() {
int result = (includeNegatives ? 1 : 0);
result = 31 * result + (backgroundIsSuperset ? 1 : 0);
return result;
}
public static class MutualInformationParser implements SignificanceHeuristicParser {
public static class MutualInformationParser extends NXYParser {
@Override
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
NAMES_FIELD.match(parser.currentName(), ParseField.EMPTY_FLAGS);
boolean includeNegatives = false;
boolean backgroundIsSuperset = true;
XContentParser.Token token = parser.nextToken();
while (!token.equals(XContentParser.Token.END_OBJECT)) {
if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
parser.nextToken();
includeNegatives = parser.booleanValue();
} else if (BACKGROUND_IS_SUPERSET.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
parser.nextToken();
backgroundIsSuperset = parser.booleanValue();
} else {
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for mutual_information.");
}
token = parser.nextToken();
}
// move to the closing bracket
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new MutualInformation(includeNegatives, backgroundIsSuperset);
}
@@ -240,23 +131,17 @@ public class MutualInformation implements SignificanceHeuristic {
}
}
public static class MutualInformationBuilder implements SignificanceHeuristicBuilder {
boolean includeNegatives = true;
private boolean backgroundIsSuperset = true;
private MutualInformationBuilder() {};
public static class MutualInformationBuilder extends NXYBuilder {
public MutualInformationBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
super(includeNegatives, backgroundIsSuperset);
}
@Override
public void toXContent(XContentBuilder builder) throws IOException {
builder.startObject(STREAM.getName())
.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset)
.endObject();
builder.startObject(STREAM.getName());
super.build(builder);
builder.endObject();
}
}
}

View File

@@ -0,0 +1,180 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import java.io.IOException;
public abstract class NXYSignificanceHeuristic extends SignificanceHeuristic {
protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");
protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");
protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";
protected final boolean backgroundIsSuperset;
/**
* Some heuristics do not differentiate between terms that are descriptive for subset or for
* the background without the subset. We might want to filter out the terms that are appear much less often
* in the subset than in the background without the subset.
*/
protected final boolean includeNegatives;
public NXYSignificanceHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeBoolean(includeNegatives);
out.writeBoolean(backgroundIsSuperset);
}
@Override
public boolean equals(Object other) {
return ((NXYSignificanceHeuristic) other).includeNegatives == includeNegatives && ((NXYSignificanceHeuristic) other).backgroundIsSuperset == backgroundIsSuperset;
}
@Override
public int hashCode() {
int result = (includeNegatives ? 1 : 0);
result = 31 * result + (backgroundIsSuperset ? 1 : 0);
return result;
}
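/**
 * Contingency table counts in the Nxy notation of Manning et al.:
 * the first index is 1 if a document contains the term (0 otherwise), the
 * second index is 1 if a document belongs to the class, i.e. the subset
 * (0 otherwise), and "_" stands for "any". For example, with
 * backgroundIsSuperset and subsetFreq=1, subsetSize=2, supersetFreq=3,
 * supersetSize=10, computeNxys below yields N11=1, N01=1, N10=2, N00=6,
 * N1_=3, N0_=7, N_1=2, N_0=8, N=10.
 */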
protected static class Frequencies {
double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
}
protected Frequencies computeNxys(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
checkFrequencies(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
Frequencies frequencies = new Frequencies();
if (backgroundIsSuperset) {
//documents not in class and do not contain term
frequencies.N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
//documents in class and do not contain term
frequencies.N01 = (subsetSize - subsetFreq);
// documents not in class and do contain term
frequencies.N10 = supersetFreq - subsetFreq;
// documents in class and do contain term
frequencies.N11 = subsetFreq;
//documents that do not contain term
frequencies.N0_ = supersetSize - supersetFreq;
//documents that contain term
frequencies.N1_ = supersetFreq;
//documents that are not in class
frequencies.N_0 = supersetSize - subsetSize;
//documents that are in class
frequencies.N_1 = subsetSize;
//all docs
frequencies.N = supersetSize;
} else {
//documents not in class and do not contain term
frequencies.N00 = supersetSize - supersetFreq;
//documents in class and do not contain term
frequencies.N01 = subsetSize - subsetFreq;
// documents not in class and do contain term
frequencies.N10 = supersetFreq;
// documents in class and do contain term
frequencies.N11 = subsetFreq;
//documents that do not contain term
frequencies.N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
//documents that contain term
frequencies.N1_ = supersetFreq + subsetFreq;
//documents that are not in class
frequencies.N_0 = supersetSize;
//documents that are in class
frequencies.N_1 = subsetSize;
//all docs
frequencies.N = supersetSize + subsetSize;
}
return frequencies;
}
protected void checkFrequencies(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
if (backgroundIsSuperset) {
if (subsetFreq > supersetFreq) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
}
if (subsetSize > supersetSize) {
throw new ElasticsearchIllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
}
if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
}
}
}
public static abstract class NXYParser implements SignificanceHeuristicParser {
@Override
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
String givenName = parser.currentName();
boolean includeNegatives = false;
boolean backgroundIsSuperset = true;
XContentParser.Token token = parser.nextToken();
while (!token.equals(XContentParser.Token.END_OBJECT)) {
if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName())) {
parser.nextToken();
includeNegatives = parser.booleanValue();
} else if (BACKGROUND_IS_SUPERSET.match(parser.currentName())) {
parser.nextToken();
backgroundIsSuperset = parser.booleanValue();
} else {
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for " + givenName);
}
token = parser.nextToken();
}
return newHeuristic(includeNegatives, backgroundIsSuperset);
}
protected abstract SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset);
}
protected abstract static class NXYBuilder implements SignificanceHeuristicBuilder {
protected boolean includeNegatives = true;
protected boolean backgroundIsSuperset = true;
public NXYBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
this.includeNegatives = includeNegatives;
this.backgroundIsSuperset = backgroundIsSuperset;
}
protected void build(XContentBuilder builder) throws IOException {
builder.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
}
}
}

View File

@@ -20,13 +20,32 @@
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.io.stream.StreamOutput;
import java.io.IOException;
public interface SignificanceHeuristic {
public abstract class SignificanceHeuristic {
/**
* @param subsetFreq The frequency of the term in the selected sample
* @param subsetSize The size of the selected sample (typically number of docs)
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
* @return a "significance" score
*/
public abstract double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize);
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize);
abstract public void writeTo(StreamOutput out) throws IOException;
void writeTo(StreamOutput out) throws IOException;
protected void checkFrequencyValidity(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()");
}
if (subsetFreq > subsetSize) {
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in " + scoreFunctionName + ".getScore()");
}
if (supersetFreq > supersetSize) {
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in " + scoreFunctionName + ".getScore()");
}
}
}
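Since `SignificanceHeuristic` is now an abstract class rather than an interface, a custom heuristic extends it and inherits `checkFrequencyValidity`. A minimal hypothetical sketch (the `Percentage` heuristic and its scoring are illustrative only, not part of this commit):

import org.elasticsearch.common.io.stream.StreamOutput;

import java.io.IOException;

// Hypothetical heuristic: scores a term by the share of its occurrences that fall into the subset.
public class Percentage extends SignificanceHeuristic {

    @Override
    public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
        // reuse the inherited sanity checks before scoring
        checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "Percentage");
        return supersetFreq == 0 ? 0.0 : (double) subsetFreq / supersetFreq;
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeString("percentage");
    }
}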

View File

@@ -35,6 +35,8 @@ public class SignificantTermsHeuristicModule extends AbstractModule {
public SignificantTermsHeuristicModule() {
registerHeuristic(JLHScore.JLHScoreParser.class, JLHScore.STREAM);
registerHeuristic(MutualInformation.MutualInformationParser.class, MutualInformation.STREAM);
registerHeuristic(GND.GNDParser.class, GND.STREAM);
registerHeuristic(ChiSquare.ChiSquareParser.class, ChiSquare.STREAM);
}
public void registerHeuristic(Class<? extends SignificanceHeuristicParser> parser, SignificanceHeuristicStreams.Stream stream) {

View File

@@ -163,7 +163,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
}
}
public static class SimpleHeuristic implements SignificanceHeuristic {
public static class SimpleHeuristic extends SignificanceHeuristic {
protected static final String[] NAMES = {"simple"};
@@ -259,15 +259,22 @@
}
// compute significance score by
// 1. terms agg on class and significant terms
// 2. filter buckets and set the background to the other class and set is_background false
// both should yield exact same result
@Test
public void testBackgroundVsSeparateSet() throws Exception {
String type = randomBoolean() ? "string" : "long";
String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
index01Docs(type, settings);
testBackgroundVsSeparateSet(new MutualInformation.MutualInformationBuilder(true, true), new MutualInformation.MutualInformationBuilder(true, false));
testBackgroundVsSeparateSet(new ChiSquare.ChiSquareBuilder(true, true), new ChiSquare.ChiSquareBuilder(true, false));
testBackgroundVsSeparateSet(new GND.GNDBuilder(true), new GND.GNDBuilder(false));
}
// compute significance score by
// 1. terms agg on class and significant terms
// 2. filter buckets and set the background to the other class and set is_background false
// both should yield exact same result
public void testBackgroundVsSeparateSet(SignificanceHeuristicBuilder significanceHeuristicExpectingSuperset, SignificanceHeuristicBuilder significanceHeuristicExpectingSeparateSets) throws Exception {
SearchResponse response1 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
.addAggregation(new TermsBuilder("class")
.field(CLASS_FIELD)
@@ -276,7 +283,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
.field(TEXT_FIELD)
.minDocCount(1)
.significanceHeuristic(
new MutualInformation.MutualInformationBuilder(true, true))))
significanceHeuristicExpectingSuperset)))
.execute()
.actionGet();
assertSearchResponse(response1);
@@ -287,14 +294,14 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
.field(TEXT_FIELD)
.minDocCount(1)
.backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
.significanceHeuristic(significanceHeuristicExpectingSeparateSets)))
.addAggregation((new FilterAggregationBuilder("1"))
.filter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
.subAggregation(new SignificantTermsBuilder("sig_terms")
.field(TEXT_FIELD)
.minDocCount(1)
.backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "0"))
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
.significanceHeuristic(significanceHeuristicExpectingSeparateSets)))
.execute()
.actionGet();
@@ -302,7 +309,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
assertThat(sigTerms0.getBuckets().size(), equalTo(2));
double score00Background = sigTerms0.getBucketByKey("0").getSignificanceScore();
double score01Background = sigTerms0.getBucketByKey("1").getSignificanceScore();
SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("0").getAggregations().asMap().get("sig_terms")));
SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("1").getAggregations().asMap().get("sig_terms")));
double score10Background = sigTerms1.getBucketByKey("0").getSignificanceScore();
double score11Background = sigTerms1.getBucketByKey("1").getSignificanceScore();
@@ -340,14 +347,20 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
}
@Test
public void testMutualInformationEqual() throws Exception {
public void testScoresEqualForPositiveAndNegative() throws Exception {
indexEqualTestData();
//now, check that results for both classes are the same with exclude negatives = false and classes are routing ids
testScoresEqualForPositiveAndNegative(new MutualInformation.MutualInformationBuilder(true, true));
testScoresEqualForPositiveAndNegative(new ChiSquare.ChiSquareBuilder(true, true));
}
public void testScoresEqualForPositiveAndNegative(SignificanceHeuristicBuilder heuristic) throws Exception {
//check that results for both classes are the same with exclude negatives = false and classes are routing ids
SearchResponse response = client().prepareSearch("test")
.addAggregation(new TermsBuilder("class").field("class").subAggregation(new SignificantTermsBuilder("mySignificantTerms")
.field("text")
.executionHint(randomExecutionHint())
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, true))
.significanceHeuristic(heuristic)
.minDocCount(1).shardSize(1000).size(1000)))
.execute()
.actionGet();

View File

@ -29,6 +29,8 @@ import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
@@ -186,8 +188,38 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
}
@Test
public void textAnalysisGND() throws Exception {
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "terje"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new GND.GNDBuilder(true))
.minDocCount(2))
.execute()
.actionGet();
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
@Test
public void textAnalysisChiSquare() throws Exception {
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "terje"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new ChiSquare.ChiSquareBuilder(false,true))
.minDocCount(2))
.execute()
.actionGet();
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
@Test
public void badFilteredAnalysis() throws Exception {
// Deliberately using a bad choice of filter here for the background context in order

View File

@@ -39,8 +39,10 @@ import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.hamcrest.Matchers.*;
@@ -66,6 +68,8 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
public void streamResponse() throws Exception {
SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName());
SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName());
SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName());
SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName());
Version version = ElasticsearchIntegrationTest.randomVersion();
InternalSignificantTerms[] sigTerms = getRandomSignificantTerms(getRandomSignificanceheuristic());
@@ -109,11 +113,12 @@
}
SignificanceHeuristic getRandomSignificanceheuristic() {
if (randomBoolean()) {
return JLHScore.INSTANCE;
} else {
return new MutualInformation(randomBoolean(), true);
}
List<SignificanceHeuristic> heuristics = new ArrayList<>();
heuristics.add(JLHScore.INSTANCE);
heuristics.add(new MutualInformation(randomBoolean(), randomBoolean()));
heuristics.add(new GND(randomBoolean()));
heuristics.add(new ChiSquare(randomBoolean(), randomBoolean()));
return heuristics.get(randomInt(3));
}
// test that
@@ -125,110 +130,111 @@
Set<SignificanceHeuristicParser> parsers = new HashSet<>();
parsers.add(new JLHScore.JLHScoreParser());
parsers.add(new MutualInformation.MutualInformationParser());
parsers.add(new GND.GNDParser());
parsers.add(new ChiSquare.ChiSquareParser());
SignificanceHeuristicParserMapper heuristicParserMapper = new SignificanceHeuristicParserMapper(parsers);
SearchContext searchContext = new SignificantTermsTestSearchContext();
// test default with string
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"jlh\":{}, \"min_doc_count\":200}");
// test jlh with string
assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"jlh\":{}") instanceof JLHScore);
// test gnd with string
assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"gnd\":{}") instanceof GND);
// test mutual information with string
boolean includeNegatives = randomBoolean();
boolean backgroundIsSuperset = randomBoolean();
assertThat(parseFromString(heuristicParserMapper, searchContext, "\"mutual_information\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new MutualInformation(includeNegatives, backgroundIsSuperset))));
assertThat(parseFromString(heuristicParserMapper, searchContext, "\"chi_square\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new ChiSquare(includeNegatives, backgroundIsSuperset))));
// test with builders
assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new JLHScore.JLHScoreBuilder()) instanceof JLHScore);
assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new GND.GNDBuilder(backgroundIsSuperset)) instanceof GND);
assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new MutualInformation.MutualInformationBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new MutualInformation(includeNegatives, backgroundIsSuperset)));
assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new ChiSquare.ChiSquareBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new ChiSquare(includeNegatives, backgroundIsSuperset)));
// test exceptions
String faultyHeuristicdefinition = "\"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}";
String expectedError = "unknown for mutual_information";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
faultyHeuristicdefinition = "\"chi_square\":{\"unknown_field\": true}";
expectedError = "unknown for chi_square";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
faultyHeuristicdefinition = "\"jlh\":{\"unknown_field\": true}";
expectedError = "expected }, got ";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
faultyHeuristicdefinition = "\"gnd\":{\"unknown_field\": true}";
expectedError = "unknown for gnd";
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
}
protected void checkParseException(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String faultyHeuristicDefinition, String expectedError) throws IOException {
try {
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + faultyHeuristicDefinition + ",\"min_doc_count\":200}");
stParser.nextToken();
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
fail();
} catch (ElasticsearchParseException e) {
assertTrue(e.getMessage().contains(expectedError));
}
}
protected SignificanceHeuristic parseFromBuilder(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, SignificanceHeuristicBuilder significanceHeuristicBuilder) throws IOException {
SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg");
stBuilder.significanceHeuristic(significanceHeuristicBuilder).field("text").minDocCount(200);
XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder();
stBuilder.internalXContent(stXContentBuilder, null);
XContentParser stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser);
}
private SignificanceHeuristic parseSignificanceHeuristic(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, XContentParser stParser) throws IOException {
stParser.nextToken();
SignificantTermsAggregatorFactory aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test default with builders
SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg");
stBuilder.significanceHeuristic(new JLHScore.JLHScoreBuilder()).field("text").minDocCount(200);
XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder();
stBuilder.internalXContent(stXContentBuilder, null);
stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
stParser.nextToken();
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test mutual_information with string
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"mutual_information\":{\"include_negatives\": false}, \"min_doc_count\":200}");
stParser.nextToken();
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertTrue(!((MutualInformation) aggregatorFactory.getSignificanceHeuristic()).getIncludeNegatives());
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test mutual_information with builders
stBuilder = new SignificantTermsBuilder("testagg");
stBuilder.significanceHeuristic(new MutualInformation.MutualInformationBuilder(false, true)).field("text").minDocCount(200);
stXContentBuilder = XContentFactory.jsonBuilder();
stBuilder.internalXContent(stXContentBuilder, null);
stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
stParser.nextToken();
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
stParser.nextToken();
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
assertTrue(!((MutualInformation) aggregatorFactory.getSignificanceHeuristic()).getIncludeNegatives());
assertThat(stParser.currentToken(), equalTo(null));
stParser.close();
// test exceptions
try {
// 1. invalid field
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}\"min_doc_count\":200}");
stParser.nextToken();
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
fail();
} catch (ElasticsearchParseException e) {
assertTrue(e.getMessage().contains("unknown for mutual_information"));
}
try {
// 2. unknown field in jlh_score
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"jlh\":{\"unknown_field\": true}, \"min_doc_count\":200}");
stParser.nextToken();
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
fail();
} catch (ElasticsearchParseException e) {
assertTrue(e.getMessage().contains("expected }, got "));
}
return aggregatorFactory.getSignificanceHeuristic();
}
@Test
public void testAssertions() throws Exception {
MutualInformation mutualInformation = new MutualInformation(true, true);
protected SignificanceHeuristic parseFromString(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String heuristicString) throws IOException {
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + heuristicString + ", \"min_doc_count\":200}");
return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser);
}
void testBackgroundAssertions(SignificanceHeuristic heuristicIsSuperset, SignificanceHeuristic heuristicNotSuperset) {
try {
mutualInformation.getScore(2, 3, 1, 4);
heuristicIsSuperset.getScore(2, 3, 1, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > supersetFreq"));
}
try {
mutualInformation.getScore(1, 4, 2, 3);
heuristicIsSuperset.getScore(1, 4, 2, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetSize > supersetSize"));
}
try {
mutualInformation.getScore(2, 1, 3, 4);
heuristicIsSuperset.getScore(2, 1, 3, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize"));
}
try {
mutualInformation.getScore(1, 2, 4, 3);
heuristicIsSuperset.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
try {
mutualInformation.getScore(1, 3, 4, 4);
heuristicIsSuperset.getScore(1, 3, 4, 4);
fail();
} catch (ElasticsearchIllegalArgumentException assertionError) {
assertNotNull(assertionError.getMessage());
@@ -238,70 +244,58 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
mutualInformation.getScore(values[0], values[1], values[2], values[3]);
heuristicIsSuperset.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
mutualInformation = new MutualInformation(true, false);
double score = mutualInformation.getScore(2, 3, 1, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
score = mutualInformation.getScore(1, 4, 2, 3);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
try {
mutualInformation.getScore(2, 1, 3, 4);
heuristicNotSuperset.getScore(2, 1, 3, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize"));
}
try {
mutualInformation.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
score = mutualInformation.getScore(1, 3, 4, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
try {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
mutualInformation.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
JLHScore jlhScore = JLHScore.INSTANCE;
try {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
jlhScore.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
try {
jlhScore.getScore(1, 2, 4, 3);
heuristicNotSuperset.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
try {
jlhScore.getScore(2, 1, 3, 4);
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
heuristicNotSuperset.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
}
void testAssertions(SignificanceHeuristic heuristic) {
try {
int idx = randomInt(3);
long[] values = {1, 2, 3, 4};
values[idx] *= -1;
heuristic.getScore(values[0], values[1], values[2], values[3]);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
}
try {
heuristic.getScore(1, 2, 4, 3);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
}
try {
heuristic.getScore(2, 1, 3, 4);
fail();
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
assertNotNull(illegalArgumentException.getMessage());
@@ -310,11 +304,30 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
}
@Test
public void scoreDefault() {
SignificanceHeuristic heuristic = JLHScore.INSTANCE;
public void testAssertions() throws Exception {
testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false));
testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false));
testBackgroundAssertions(new GND(true), new GND(false));
testAssertions(JLHScore.INSTANCE);
}
@Test
public void basicScoreProperties() {
basicScoreProperties(JLHScore.INSTANCE, true);
basicScoreProperties(new GND(true), true);
basicScoreProperties(new MutualInformation(true, true), false);
basicScoreProperties(new ChiSquare(true, true), false);
}
public void basicScoreProperties(SignificanceHeuristic heuristic, boolean test0) {
assertThat(heuristic.getScore(1, 1, 1, 3), greaterThan(0.0));
assertThat(heuristic.getScore(1, 1, 2, 3), lessThan(heuristic.getScore(1, 1, 1, 3)));
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0));
assertThat(heuristic.getScore(1, 1, 3, 4), lessThan(heuristic.getScore(1, 1, 2, 4)));
if (test0) {
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0));
}
double score = 0.0;
try {
long a = randomLong();
@@ -350,7 +363,34 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
assertThat(score, lessThanOrEqualTo(1.0));
assertThat(score, greaterThanOrEqualTo(0.0));
heuristic = new MutualInformation(false, true);
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(-1.0 * Double.MAX_VALUE));
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(Double.NEGATIVE_INFINITY));
heuristic = new MutualInformation(true, false);
score = heuristic.getScore(2, 3, 1, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
score = heuristic.getScore(1, 4, 2, 3);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
score = heuristic.getScore(1, 3, 4, 4);
assertThat(score, greaterThanOrEqualTo(0.0));
assertThat(score, lessThanOrEqualTo(1.0));
}
@Test
public void testGNDCornerCases() throws Exception {
GND gnd = new GND(true);
//term is only in the subset, not at all in the other set but that is because the other set is empty.
// this should actually not happen because only terms that are in the subset are considered now,
// however, in this case the score should be 0 because a term that does not exist cannot be relevant...
assertThat(gnd.getScore(0, randomIntBetween(1, 2), 0, randomIntBetween(2,3)), equalTo(0.0));
// the terms do not co-occur at all - should be 0
assertThat(gnd.getScore(0, randomIntBetween(1, 2), randomIntBetween(2, 3), randomIntBetween(5,6)), equalTo(0.0));
// comparison between two terms that do not exist - probably not relevant
assertThat(gnd.getScore(0, 0, 0, randomIntBetween(1,2)), equalTo(0.0));
// terms co-occur perfectly - should be 1
assertThat(gnd.getScore(1, 1, 1, 1), equalTo(1.0));
gnd = new GND(false);
assertThat(gnd.getScore(0, 0, 0, 0), equalTo(0.0));
}
}