significant terms: add google normalized distance, add chi square
closes #6858
This commit is contained in:
parent
5f0719fd50
commit
a3cefd919e
|
@ -306,7 +306,45 @@ Per default, the assumption is that the documents in the bucket are also contain
|
|||
"background_is_superset": false
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
|
||||
===== Chi square
|
||||
added[1.4.0]
|
||||
|
||||
Chi square as described in "Information Retrieval", Manning et al., Chapter 13.5.2 can be used as a significance score by adding the parameter
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
||||
"chi_square": {
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Chi square behaves like mutual information and can be configured with the same parameters `include_negatives` and `background_is_superset`.
|
||||
|
||||
|
||||
===== Google normalized distance
|
||||
added[1.4.0]
|
||||
|
||||
Google normalized distance as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (http://arxiv.org/pdf/cs/0412098v3.pdf) can be used as a significance score by adding the parameter
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
||||
"gnd": {
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
`gnd` also accepts the `background_is_superset` parameter.
|
||||
|
||||
===== Which one is best?
|
||||
|
||||
|
||||
Roughly, `mutual_information` prefers highly frequent terms even if they also occur frequently in the background. For example, in an analysis of natural language text this might lead to selection of stop words. `mutual_information` is unlikely to select very rare terms like misspellings. `gnd` prefers terms with a high co-occurrence and avoids selection of stop words. It might be better suited for synonym detection. However, `gnd` has a tendency to select very rare terms that are, for example, a result of misspelling. `chi_square` and `jlh` are somewhat in-between.
|
||||
|
||||
It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf[Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997] for a study on using significant terms for feature selection for text classification).
|
||||
|
||||
|
||||
|
||||
|
||||
===== Size & Shard Size
|
||||
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||
|
||||
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class ChiSquare extends NXYSignificanceHeuristic {
|
||||
|
||||
protected static final ParseField NAMES_FIELD = new ParseField("chi_square");
|
||||
|
||||
public ChiSquare(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
super(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof ChiSquare)) {
|
||||
return false;
|
||||
}
|
||||
return super.equals(other);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = NAMES_FIELD.getPreferredName().hashCode();
|
||||
result = 31 * result + super.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
|
||||
@Override
|
||||
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
|
||||
return new ChiSquare(in.readBoolean(), in.readBoolean());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return NAMES_FIELD.getPreferredName();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Calculates Chi^2
|
||||
* see "Information Retrieval", Manning et al., Eq. 13.19
|
||||
*/
|
||||
@Override
|
||||
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
|
||||
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "ChiSquare");
|
||||
|
||||
// here we check if the term appears more often in subset than in background without subset.
|
||||
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
return (frequencies.N * Math.pow((frequencies.N11 * frequencies.N00 - frequencies.N01 * frequencies.N10), 2.0) /
|
||||
((frequencies.N_1) * (frequencies.N1_) * (frequencies.N0_) * (frequencies.N_0)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeString(STREAM.getName());
|
||||
super.writeTo(out);
|
||||
}
|
||||
|
||||
public static class ChiSquareParser extends NXYParser {
|
||||
|
||||
@Override
|
||||
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
return new ChiSquare(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getNames() {
|
||||
return NAMES_FIELD.getAllNamesIncludedDeprecated();
|
||||
}
|
||||
}
|
||||
|
||||
public static class ChiSquareBuilder extends NXYSignificanceHeuristic.NXYBuilder {
|
||||
|
||||
public ChiSquareBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
super(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void toXContent(XContentBuilder builder) throws IOException {
|
||||
builder.startObject(STREAM.getName());
|
||||
super.build(builder);
|
||||
builder.endObject();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||
|
||||
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.index.query.QueryParsingException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class GND extends NXYSignificanceHeuristic {
|
||||
|
||||
protected static final ParseField NAMES_FIELD = new ParseField("gnd");
|
||||
|
||||
public GND(boolean backgroundIsSuperset) {
|
||||
super(true, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof GND)) {
|
||||
return false;
|
||||
}
|
||||
return super.equals(other);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = NAMES_FIELD.getPreferredName().hashCode();
|
||||
result = 31 * result + super.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
|
||||
@Override
|
||||
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
|
||||
return new GND(in.readBoolean());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return NAMES_FIELD.getPreferredName();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Calculates Google Normalized Distance, as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007
|
||||
* link: http://arxiv.org/pdf/cs/0412098v3.pdf
|
||||
*/
|
||||
@Override
|
||||
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
|
||||
|
||||
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "GND");
|
||||
double fx = frequencies.N1_;
|
||||
double fy = frequencies.N_1;
|
||||
double fxy = frequencies.N11;
|
||||
double N = frequencies.N;
|
||||
if (fxy == 0) {
|
||||
// no co-occurrence
|
||||
return 0.0;
|
||||
}
|
||||
if ((fx == fy) && (fx == fxy)) {
|
||||
// perfect co-occurrence
|
||||
return 1.0;
|
||||
}
|
||||
double score = (Math.max(Math.log(fx), Math.log(fy)) - Math.log(fxy)) /
|
||||
(Math.log(N) - Math.min(Math.log(fx), Math.log(fy)));
|
||||
|
||||
//we must invert the order of terms because GND scores relevant terms low
|
||||
score = Math.exp(-1.0d * score);
|
||||
return score;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeString(STREAM.getName());
|
||||
out.writeBoolean(backgroundIsSuperset);
|
||||
}
|
||||
|
||||
public static class GNDParser extends NXYParser {
|
||||
|
||||
@Override
|
||||
public String[] getNames() {
|
||||
return NAMES_FIELD.getAllNamesIncludedDeprecated();
|
||||
}
|
||||
|
||||
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
return new GND(backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
|
||||
String givenName = parser.currentName();
|
||||
boolean backgroundIsSuperset = true;
|
||||
XContentParser.Token token = parser.nextToken();
|
||||
while (!token.equals(XContentParser.Token.END_OBJECT)) {
|
||||
if (BACKGROUND_IS_SUPERSET.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
|
||||
parser.nextToken();
|
||||
backgroundIsSuperset = parser.booleanValue();
|
||||
} else {
|
||||
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for " + givenName);
|
||||
}
|
||||
token = parser.nextToken();
|
||||
}
|
||||
return newHeuristic(true, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class GNDBuilder extends NXYBuilder {
|
||||
|
||||
public GNDBuilder(boolean backgroundIsSuperset) {
|
||||
super(true, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void toXContent(XContentBuilder builder) throws IOException {
|
||||
builder.startObject(STREAM.getName());
|
||||
builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
|
||||
builder.endObject();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -31,7 +31,7 @@ import org.elasticsearch.index.query.QueryParsingException;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
public class JLHScore implements SignificanceHeuristic {
|
||||
public class JLHScore extends SignificanceHeuristic {
|
||||
|
||||
public static final JLHScore INSTANCE = new JLHScore();
|
||||
|
||||
|
@ -59,25 +59,10 @@ public class JLHScore implements SignificanceHeuristic {
|
|||
* Calculates the significance of a term in a sample against a background of
|
||||
* normal distributions by comparing the changes in frequency. This is the heart
|
||||
* of the significant terms feature.
|
||||
* <p/>
|
||||
*
|
||||
* @param subsetFreq The frequency of the term in the selected sample
|
||||
* @param subsetSize The size of the selected sample (typically number of docs)
|
||||
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
|
||||
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
|
||||
* @return a "significance" score
|
||||
*/
|
||||
@Override
|
||||
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
|
||||
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
|
||||
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in JLHScore.getScore()");
|
||||
}
|
||||
if (subsetFreq > subsetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)");
|
||||
}
|
||||
if (supersetFreq > supersetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)");
|
||||
}
|
||||
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "JLHScore");
|
||||
if ((subsetSize == 0) || (supersetSize == 0)) {
|
||||
// avoid any divide by zero issues
|
||||
return 0;
|
||||
|
|
|
@ -21,50 +21,36 @@
|
|||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||
|
||||
|
||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.index.query.QueryParsingException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class MutualInformation implements SignificanceHeuristic {
|
||||
public class MutualInformation extends NXYSignificanceHeuristic {
|
||||
|
||||
protected static final ParseField NAMES_FIELD = new ParseField("mutual_information");
|
||||
|
||||
protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");
|
||||
|
||||
protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");
|
||||
|
||||
protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";
|
||||
|
||||
private static final double log2 = Math.log(2.0);
|
||||
|
||||
/**
|
||||
* Mutual information does not differentiate between terms that are descriptive for subset or for
|
||||
* the background without the subset. We might want to filter out the terms that are appear much less often
|
||||
* in the subset than in the background without the subset.
|
||||
*/
|
||||
protected boolean includeNegatives = false;
|
||||
private boolean backgroundIsSuperset = true;
|
||||
|
||||
private MutualInformation() {};
|
||||
|
||||
public MutualInformation(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
this.includeNegatives = includeNegatives;
|
||||
this.backgroundIsSuperset = backgroundIsSuperset;
|
||||
super(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (! (other instanceof MutualInformation)) {
|
||||
if (!(other instanceof MutualInformation)) {
|
||||
return false;
|
||||
}
|
||||
return ((MutualInformation)other).includeNegatives == includeNegatives && ((MutualInformation)other).backgroundIsSuperset == backgroundIsSuperset;
|
||||
return super.equals(other);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = NAMES_FIELD.getPreferredName().hashCode();
|
||||
result = 31 * result + super.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
|
||||
|
@ -82,88 +68,23 @@ public class MutualInformation implements SignificanceHeuristic {
|
|||
/**
|
||||
* Calculates mutual information
|
||||
* see "Information Retrieval", Manning et al., Eq. 13.17
|
||||
*
|
||||
* @param subsetFreq The frequency of the term in the selected sample
|
||||
* @param subsetSize The size of the selected sample (typically number of docs)
|
||||
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
|
||||
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
|
||||
* @return a "significance" score
|
||||
*/
|
||||
@Override
|
||||
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
|
||||
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
|
||||
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in MutualInformation.getScore()");
|
||||
}
|
||||
if (subsetFreq > subsetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in MutualInformation.score(..)");
|
||||
}
|
||||
if (supersetFreq > supersetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in MutualInformation.score(..)");
|
||||
}
|
||||
if (backgroundIsSuperset) {
|
||||
if (subsetFreq > supersetFreq) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
|
||||
}
|
||||
if (subsetSize > supersetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
|
||||
}
|
||||
if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
|
||||
}
|
||||
}
|
||||
double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
|
||||
if (backgroundIsSuperset) {
|
||||
//documents not in class and do not contain term
|
||||
N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
|
||||
//documents in class and do not contain term
|
||||
N01 = (subsetSize - subsetFreq);
|
||||
// documents not in class and do contain term
|
||||
N10 = supersetFreq - subsetFreq;
|
||||
// documents in class and do contain term
|
||||
N11 = subsetFreq;
|
||||
//documents that do not contain term
|
||||
N0_ = supersetSize - supersetFreq;
|
||||
//documents that contain term
|
||||
N1_ = supersetFreq;
|
||||
//documents that are not in class
|
||||
N_0 = supersetSize - subsetSize;
|
||||
//documents that are in class
|
||||
N_1 = subsetSize;
|
||||
//all docs
|
||||
N = supersetSize;
|
||||
} else {
|
||||
//documents not in class and do not contain term
|
||||
N00 = supersetSize - supersetFreq;
|
||||
//documents in class and do not contain term
|
||||
N01 = subsetSize - subsetFreq;
|
||||
// documents not in class and do contain term
|
||||
N10 = supersetFreq;
|
||||
// documents in class and do contain term
|
||||
N11 = subsetFreq;
|
||||
//documents that do not contain term
|
||||
N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
|
||||
//documents that contain term
|
||||
N1_ = supersetFreq + subsetFreq;
|
||||
//documents that are not in class
|
||||
N_0 = supersetSize;
|
||||
//documents that are in class
|
||||
N_1 = subsetSize;
|
||||
//all docs
|
||||
N = supersetSize + subsetSize;
|
||||
}
|
||||
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "MutualInformation");
|
||||
|
||||
double score = (getMITerm(N00, N0_, N_0, N) +
|
||||
getMITerm(N01, N0_, N_1, N) +
|
||||
getMITerm(N10, N1_, N_0, N) +
|
||||
getMITerm(N11, N1_, N_1, N))
|
||||
double score = (getMITerm(frequencies.N00, frequencies.N0_, frequencies.N_0, frequencies.N) +
|
||||
getMITerm(frequencies.N01, frequencies.N0_, frequencies.N_1, frequencies.N) +
|
||||
getMITerm(frequencies.N10, frequencies.N1_, frequencies.N_0, frequencies.N) +
|
||||
getMITerm(frequencies.N11, frequencies.N1_, frequencies.N_1, frequencies.N))
|
||||
/ log2;
|
||||
|
||||
if (Double.isNaN(score)) {
|
||||
score = -1.0 * Float.MAX_VALUE;
|
||||
score = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
// here we check if the term appears more often in subset than in background without subset.
|
||||
if (!includeNegatives && N11 / N_1 < N10 / N_0) {
|
||||
score = -1.0 * Double.MAX_VALUE;
|
||||
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
|
||||
score = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
@ -194,43 +115,13 @@ public class MutualInformation implements SignificanceHeuristic {
|
|||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeString(STREAM.getName());
|
||||
out.writeBoolean(includeNegatives);
|
||||
out.writeBoolean(backgroundIsSuperset);
|
||||
|
||||
super.writeTo(out);
|
||||
}
|
||||
|
||||
public boolean getIncludeNegatives() {
|
||||
return includeNegatives;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = (includeNegatives ? 1 : 0);
|
||||
result = 31 * result + (backgroundIsSuperset ? 1 : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
public static class MutualInformationParser implements SignificanceHeuristicParser {
|
||||
public static class MutualInformationParser extends NXYParser {
|
||||
|
||||
@Override
|
||||
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
|
||||
NAMES_FIELD.match(parser.currentName(), ParseField.EMPTY_FLAGS);
|
||||
boolean includeNegatives = false;
|
||||
boolean backgroundIsSuperset = true;
|
||||
XContentParser.Token token = parser.nextToken();
|
||||
while (!token.equals(XContentParser.Token.END_OBJECT)) {
|
||||
if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
|
||||
parser.nextToken();
|
||||
includeNegatives = parser.booleanValue();
|
||||
} else if (BACKGROUND_IS_SUPERSET.match(parser.currentName(), ParseField.EMPTY_FLAGS)) {
|
||||
parser.nextToken();
|
||||
backgroundIsSuperset = parser.booleanValue();
|
||||
} else {
|
||||
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for mutual_information.");
|
||||
}
|
||||
token = parser.nextToken();
|
||||
}
|
||||
// move to the closing bracket
|
||||
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
return new MutualInformation(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
|
@ -240,23 +131,17 @@ public class MutualInformation implements SignificanceHeuristic {
|
|||
}
|
||||
}
|
||||
|
||||
public static class MutualInformationBuilder implements SignificanceHeuristicBuilder {
|
||||
|
||||
boolean includeNegatives = true;
|
||||
private boolean backgroundIsSuperset = true;
|
||||
|
||||
private MutualInformationBuilder() {};
|
||||
public static class MutualInformationBuilder extends NXYBuilder {
|
||||
|
||||
public MutualInformationBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
this.includeNegatives = includeNegatives;
|
||||
this.backgroundIsSuperset = backgroundIsSuperset;
|
||||
super(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void toXContent(XContentBuilder builder) throws IOException {
|
||||
builder.startObject(STREAM.getName())
|
||||
.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
|
||||
.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset)
|
||||
.endObject();
|
||||
builder.startObject(STREAM.getName());
|
||||
super.build(builder);
|
||||
builder.endObject();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,180 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||
|
||||
|
||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.index.query.QueryParsingException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public abstract class NXYSignificanceHeuristic extends SignificanceHeuristic {
|
||||
|
||||
protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");
|
||||
|
||||
protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");
|
||||
|
||||
protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";
|
||||
|
||||
protected final boolean backgroundIsSuperset;
|
||||
|
||||
/**
|
||||
* Some heuristics do not differentiate between terms that are descriptive for subset or for
|
||||
* the background without the subset. We might want to filter out the terms that are appear much less often
|
||||
* in the subset than in the background without the subset.
|
||||
*/
|
||||
protected final boolean includeNegatives;
|
||||
|
||||
public NXYSignificanceHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
this.includeNegatives = includeNegatives;
|
||||
this.backgroundIsSuperset = backgroundIsSuperset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeBoolean(includeNegatives);
|
||||
out.writeBoolean(backgroundIsSuperset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
return ((NXYSignificanceHeuristic) other).includeNegatives == includeNegatives && ((NXYSignificanceHeuristic) other).backgroundIsSuperset == backgroundIsSuperset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = (includeNegatives ? 1 : 0);
|
||||
result = 31 * result + (backgroundIsSuperset ? 1 : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
protected static class Frequencies {
|
||||
double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
|
||||
}
|
||||
|
||||
protected Frequencies computeNxys(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
|
||||
checkFrequencies(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
|
||||
Frequencies frequencies = new Frequencies();
|
||||
if (backgroundIsSuperset) {
|
||||
//documents not in class and do not contain term
|
||||
frequencies.N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
|
||||
//documents in class and do not contain term
|
||||
frequencies.N01 = (subsetSize - subsetFreq);
|
||||
// documents not in class and do contain term
|
||||
frequencies.N10 = supersetFreq - subsetFreq;
|
||||
// documents in class and do contain term
|
||||
frequencies.N11 = subsetFreq;
|
||||
//documents that do not contain term
|
||||
frequencies.N0_ = supersetSize - supersetFreq;
|
||||
//documents that contain term
|
||||
frequencies.N1_ = supersetFreq;
|
||||
//documents that are not in class
|
||||
frequencies.N_0 = supersetSize - subsetSize;
|
||||
//documents that are in class
|
||||
frequencies.N_1 = subsetSize;
|
||||
//all docs
|
||||
frequencies.N = supersetSize;
|
||||
} else {
|
||||
//documents not in class and do not contain term
|
||||
frequencies.N00 = supersetSize - supersetFreq;
|
||||
//documents in class and do not contain term
|
||||
frequencies.N01 = subsetSize - subsetFreq;
|
||||
// documents not in class and do contain term
|
||||
frequencies.N10 = supersetFreq;
|
||||
// documents in class and do contain term
|
||||
frequencies.N11 = subsetFreq;
|
||||
//documents that do not contain term
|
||||
frequencies.N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
|
||||
//documents that contain term
|
||||
frequencies.N1_ = supersetFreq + subsetFreq;
|
||||
//documents that are not in class
|
||||
frequencies.N_0 = supersetSize;
|
||||
//documents that are in class
|
||||
frequencies.N_1 = subsetSize;
|
||||
//all docs
|
||||
frequencies.N = supersetSize + subsetSize;
|
||||
}
|
||||
return frequencies;
|
||||
}
|
||||
|
||||
protected void checkFrequencies(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
|
||||
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
|
||||
if (backgroundIsSuperset) {
|
||||
if (subsetFreq > supersetFreq) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
|
||||
}
|
||||
if (subsetSize > supersetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
|
||||
}
|
||||
if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static abstract class NXYParser implements SignificanceHeuristicParser {
|
||||
|
||||
@Override
|
||||
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
|
||||
String givenName = parser.currentName();
|
||||
boolean includeNegatives = false;
|
||||
boolean backgroundIsSuperset = true;
|
||||
XContentParser.Token token = parser.nextToken();
|
||||
while (!token.equals(XContentParser.Token.END_OBJECT)) {
|
||||
if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName())) {
|
||||
parser.nextToken();
|
||||
includeNegatives = parser.booleanValue();
|
||||
} else if (BACKGROUND_IS_SUPERSET.match(parser.currentName())) {
|
||||
parser.nextToken();
|
||||
backgroundIsSuperset = parser.booleanValue();
|
||||
} else {
|
||||
throw new ElasticsearchParseException("Field " + parser.currentName().toString() + " unknown for " + givenName);
|
||||
}
|
||||
token = parser.nextToken();
|
||||
}
|
||||
return newHeuristic(includeNegatives, backgroundIsSuperset);
|
||||
}
|
||||
|
||||
protected abstract SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset);
|
||||
}
|
||||
|
||||
|
||||
protected abstract static class NXYBuilder implements SignificanceHeuristicBuilder {
|
||||
protected boolean includeNegatives = true;
|
||||
protected boolean backgroundIsSuperset = true;
|
||||
|
||||
public NXYBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
|
||||
this.includeNegatives = includeNegatives;
|
||||
this.backgroundIsSuperset = backgroundIsSuperset;
|
||||
}
|
||||
|
||||
protected void build(XContentBuilder builder) throws IOException {
|
||||
builder.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
|
||||
.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -20,13 +20,32 @@
|
|||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||
|
||||
|
||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public interface SignificanceHeuristic {
|
||||
public abstract class SignificanceHeuristic {
|
||||
/**
|
||||
* @param subsetFreq The frequency of the term in the selected sample
|
||||
* @param subsetSize The size of the selected sample (typically number of docs)
|
||||
* @param supersetFreq The frequency of the term in the superset from which the sample was taken
|
||||
* @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
|
||||
* @return a "significance" score
|
||||
*/
|
||||
public abstract double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize);
|
||||
|
||||
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize);
|
||||
abstract public void writeTo(StreamOutput out) throws IOException;
|
||||
|
||||
void writeTo(StreamOutput out) throws IOException;
|
||||
protected void checkFrequencyValidity(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) {
|
||||
if (subsetFreq < 0 || subsetSize < 0 || supersetFreq < 0 || supersetSize < 0) {
|
||||
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()");
|
||||
}
|
||||
if (subsetFreq > subsetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)");
|
||||
}
|
||||
if (supersetFreq > supersetSize) {
|
||||
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,6 +35,8 @@ public class SignificantTermsHeuristicModule extends AbstractModule {
|
|||
public SignificantTermsHeuristicModule() {
|
||||
registerHeuristic(JLHScore.JLHScoreParser.class, JLHScore.STREAM);
|
||||
registerHeuristic(MutualInformation.MutualInformationParser.class, MutualInformation.STREAM);
|
||||
registerHeuristic(GND.GNDParser.class, GND.STREAM);
|
||||
registerHeuristic(ChiSquare.ChiSquareParser.class, ChiSquare.STREAM);
|
||||
}
|
||||
|
||||
public void registerHeuristic(Class<? extends SignificanceHeuristicParser> parser, SignificanceHeuristicStreams.Stream stream) {
|
||||
|
|
|
@ -163,7 +163,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
|
|||
}
|
||||
}
|
||||
|
||||
public static class SimpleHeuristic implements SignificanceHeuristic {
|
||||
public static class SimpleHeuristic extends SignificanceHeuristic {
|
||||
|
||||
protected static final String[] NAMES = {"simple"};
|
||||
|
||||
|
@ -259,15 +259,22 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
|
|||
|
||||
}
|
||||
|
||||
// compute significance score by
|
||||
// 1. terms agg on class and significant terms
|
||||
// 2. filter buckets and set the background to the other class and set is_background false
|
||||
// both should yield exact same result
|
||||
@Test
|
||||
public void testBackgroundVsSeparateSet() throws Exception {
|
||||
String type = randomBoolean() ? "string" : "long";
|
||||
String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
|
||||
index01Docs(type, settings);
|
||||
testBackgroundVsSeparateSet(new MutualInformation.MutualInformationBuilder(true, true), new MutualInformation.MutualInformationBuilder(true, false));
|
||||
testBackgroundVsSeparateSet(new ChiSquare.ChiSquareBuilder(true, true), new ChiSquare.ChiSquareBuilder(true, false));
|
||||
testBackgroundVsSeparateSet(new GND.GNDBuilder(true), new GND.GNDBuilder(false));
|
||||
}
|
||||
|
||||
// compute significance score by
|
||||
// 1. terms agg on class and significant terms
|
||||
// 2. filter buckets and set the background to the other class and set is_background false
|
||||
// both should yield exact same result
|
||||
public void testBackgroundVsSeparateSet(SignificanceHeuristicBuilder significanceHeuristicExpectingSuperset, SignificanceHeuristicBuilder significanceHeuristicExpectingSeparateSets) throws Exception {
|
||||
|
||||
SearchResponse response1 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
|
||||
.addAggregation(new TermsBuilder("class")
|
||||
.field(CLASS_FIELD)
|
||||
|
@ -276,7 +283,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
|
|||
.field(TEXT_FIELD)
|
||||
.minDocCount(1)
|
||||
.significanceHeuristic(
|
||||
new MutualInformation.MutualInformationBuilder(true, true))))
|
||||
significanceHeuristicExpectingSuperset)))
|
||||
.execute()
|
||||
.actionGet();
|
||||
assertSearchResponse(response1);
|
||||
|
@ -287,14 +294,14 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
|
|||
.field(TEXT_FIELD)
|
||||
.minDocCount(1)
|
||||
.backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
|
||||
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
|
||||
.significanceHeuristic(significanceHeuristicExpectingSeparateSets)))
|
||||
.addAggregation((new FilterAggregationBuilder("1"))
|
||||
.filter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
|
||||
.subAggregation(new SignificantTermsBuilder("sig_terms")
|
||||
.field(TEXT_FIELD)
|
||||
.minDocCount(1)
|
||||
.backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "0"))
|
||||
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
|
||||
.significanceHeuristic(significanceHeuristicExpectingSeparateSets)))
|
||||
.execute()
|
||||
.actionGet();
|
||||
|
||||
|
@ -302,7 +309,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
|
|||
assertThat(sigTerms0.getBuckets().size(), equalTo(2));
|
||||
double score00Background = sigTerms0.getBucketByKey("0").getSignificanceScore();
|
||||
double score01Background = sigTerms0.getBucketByKey("1").getSignificanceScore();
|
||||
SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("0").getAggregations().asMap().get("sig_terms")));
|
||||
SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("1").getAggregations().asMap().get("sig_terms")));
|
||||
double score10Background = sigTerms1.getBucketByKey("0").getSignificanceScore();
|
||||
double score11Background = sigTerms1.getBucketByKey("1").getSignificanceScore();
|
||||
|
||||
|
@ -340,14 +347,20 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testMutualInformationEqual() throws Exception {
|
||||
public void testScoresEqualForPositiveAndNegative() throws Exception {
|
||||
indexEqualTestData();
|
||||
//now, check that results for both classes are the same with exclude negatives = false and classes are routing ids
|
||||
testScoresEqualForPositiveAndNegative(new MutualInformation.MutualInformationBuilder(true, true));
|
||||
testScoresEqualForPositiveAndNegative(new ChiSquare.ChiSquareBuilder(true, true));
|
||||
}
|
||||
|
||||
public void testScoresEqualForPositiveAndNegative(SignificanceHeuristicBuilder heuristic) throws Exception {
|
||||
|
||||
//check that results for both classes are the same with exclude negatives = false and classes are routing ids
|
||||
SearchResponse response = client().prepareSearch("test")
|
||||
.addAggregation(new TermsBuilder("class").field("class").subAggregation(new SignificantTermsBuilder("mySignificantTerms")
|
||||
.field("text")
|
||||
.executionHint(randomExecutionHint())
|
||||
.significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, true))
|
||||
.significanceHeuristic(heuristic)
|
||||
.minDocCount(1).shardSize(1000).size(1000)))
|
||||
.execute()
|
||||
.actionGet();
|
||||
|
|
|
@ -29,6 +29,8 @@ import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms
|
|||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
|
||||
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
|
||||
|
@ -186,8 +188,38 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
|
|||
assertSearchResponse(response);
|
||||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||
checkExpectedStringTermsFound(topTerms);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void textAnalysisGND() throws Exception {
|
||||
SearchResponse response = client().prepareSearch("test")
|
||||
.setSearchType(SearchType.QUERY_AND_FETCH)
|
||||
.setQuery(new TermQueryBuilder("_all", "terje"))
|
||||
.setFrom(0).setSize(60).setExplain(true)
|
||||
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new GND.GNDBuilder(true))
|
||||
.minDocCount(2))
|
||||
.execute()
|
||||
.actionGet();
|
||||
assertSearchResponse(response);
|
||||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||
checkExpectedStringTermsFound(topTerms);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void textAnalysisChiSquare() throws Exception {
|
||||
SearchResponse response = client().prepareSearch("test")
|
||||
.setSearchType(SearchType.QUERY_AND_FETCH)
|
||||
.setQuery(new TermQueryBuilder("_all", "terje"))
|
||||
.setFrom(0).setSize(60).setExplain(true)
|
||||
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new ChiSquare.ChiSquareBuilder(false,true))
|
||||
.minDocCount(2))
|
||||
.execute()
|
||||
.actionGet();
|
||||
assertSearchResponse(response);
|
||||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||
checkExpectedStringTermsFound(topTerms);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void badFilteredAnalysis() throws Exception {
|
||||
// Deliberately using a bad choice of filter here for the background context in order
|
||||
|
|
|
@ -39,8 +39,10 @@ import org.junit.Test;
|
|||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.hamcrest.Matchers.*;
|
||||
|
@ -66,6 +68,8 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||
public void streamResponse() throws Exception {
|
||||
SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName());
|
||||
SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName());
|
||||
SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName());
|
||||
SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName());
|
||||
Version version = ElasticsearchIntegrationTest.randomVersion();
|
||||
InternalSignificantTerms[] sigTerms = getRandomSignificantTerms(getRandomSignificanceheuristic());
|
||||
|
||||
|
@ -109,11 +113,12 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||
}
|
||||
|
||||
SignificanceHeuristic getRandomSignificanceheuristic() {
|
||||
if (randomBoolean()) {
|
||||
return JLHScore.INSTANCE;
|
||||
} else {
|
||||
return new MutualInformation(randomBoolean(), true);
|
||||
}
|
||||
List<SignificanceHeuristic> heuristics = new ArrayList<>();
|
||||
heuristics.add(JLHScore.INSTANCE);
|
||||
heuristics.add(new MutualInformation(randomBoolean(), randomBoolean()));
|
||||
heuristics.add(new GND(randomBoolean()));
|
||||
heuristics.add(new ChiSquare(randomBoolean(), randomBoolean()));
|
||||
return heuristics.get(randomInt(3));
|
||||
}
|
||||
|
||||
// test that
|
||||
|
@ -125,110 +130,111 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||
Set<SignificanceHeuristicParser> parsers = new HashSet<>();
|
||||
parsers.add(new JLHScore.JLHScoreParser());
|
||||
parsers.add(new MutualInformation.MutualInformationParser());
|
||||
parsers.add(new GND.GNDParser());
|
||||
parsers.add(new ChiSquare.ChiSquareParser());
|
||||
SignificanceHeuristicParserMapper heuristicParserMapper = new SignificanceHeuristicParserMapper(parsers);
|
||||
SearchContext searchContext = new SignificantTermsTestSearchContext();
|
||||
|
||||
// test default with string
|
||||
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"jlh\":{}, \"min_doc_count\":200}");
|
||||
// test jlh with string
|
||||
assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"jlh\":{}") instanceof JLHScore);
|
||||
// test gnd with string
|
||||
assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"gnd\":{}") instanceof GND);
|
||||
// test mutual information with string
|
||||
boolean includeNegatives = randomBoolean();
|
||||
boolean backgroundIsSuperset = randomBoolean();
|
||||
assertThat(parseFromString(heuristicParserMapper, searchContext, "\"mutual_information\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new MutualInformation(includeNegatives, backgroundIsSuperset))));
|
||||
assertThat(parseFromString(heuristicParserMapper, searchContext, "\"chi_square\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new ChiSquare(includeNegatives, backgroundIsSuperset))));
|
||||
|
||||
// test with builders
|
||||
assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new JLHScore.JLHScoreBuilder()) instanceof JLHScore);
|
||||
assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new GND.GNDBuilder(backgroundIsSuperset)) instanceof GND);
|
||||
assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new MutualInformation.MutualInformationBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new MutualInformation(includeNegatives, backgroundIsSuperset)));
|
||||
assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new ChiSquare.ChiSquareBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new ChiSquare(includeNegatives, backgroundIsSuperset)));
|
||||
|
||||
// test exceptions
|
||||
String faultyHeuristicdefinition = "\"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}";
|
||||
String expectedError = "unknown for mutual_information";
|
||||
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
|
||||
|
||||
faultyHeuristicdefinition = "\"chi_square\":{\"unknown_field\": true}";
|
||||
expectedError = "unknown for chi_square";
|
||||
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
|
||||
|
||||
faultyHeuristicdefinition = "\"jlh\":{\"unknown_field\": true}";
|
||||
expectedError = "expected }, got ";
|
||||
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
|
||||
|
||||
faultyHeuristicdefinition = "\"gnd\":{\"unknown_field\": true}";
|
||||
expectedError = "unknown for gnd";
|
||||
checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError);
|
||||
}
|
||||
|
||||
protected void checkParseException(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String faultyHeuristicDefinition, String expectedError) throws IOException {
|
||||
try {
|
||||
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + faultyHeuristicDefinition + ",\"min_doc_count\":200}");
|
||||
stParser.nextToken();
|
||||
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
fail();
|
||||
} catch (ElasticsearchParseException e) {
|
||||
assertTrue(e.getMessage().contains(expectedError));
|
||||
}
|
||||
}
|
||||
|
||||
protected SignificanceHeuristic parseFromBuilder(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, SignificanceHeuristicBuilder significanceHeuristicBuilder) throws IOException {
|
||||
SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg");
|
||||
stBuilder.significanceHeuristic(significanceHeuristicBuilder).field("text").minDocCount(200);
|
||||
XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder();
|
||||
stBuilder.internalXContent(stXContentBuilder, null);
|
||||
XContentParser stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
|
||||
return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser);
|
||||
}
|
||||
|
||||
private SignificanceHeuristic parseSignificanceHeuristic(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, XContentParser stParser) throws IOException {
|
||||
stParser.nextToken();
|
||||
SignificantTermsAggregatorFactory aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
stParser.nextToken();
|
||||
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
|
||||
assertThat(stParser.currentToken(), equalTo(null));
|
||||
stParser.close();
|
||||
|
||||
// test default with builders
|
||||
SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg");
|
||||
stBuilder.significanceHeuristic(new JLHScore.JLHScoreBuilder()).field("text").minDocCount(200);
|
||||
XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder();
|
||||
stBuilder.internalXContent(stXContentBuilder, null);
|
||||
stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
|
||||
stParser.nextToken();
|
||||
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
stParser.nextToken();
|
||||
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
|
||||
assertThat(stParser.currentToken(), equalTo(null));
|
||||
stParser.close();
|
||||
|
||||
// test mutual_information with string
|
||||
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"mutual_information\":{\"include_negatives\": false}, \"min_doc_count\":200}");
|
||||
stParser.nextToken();
|
||||
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
stParser.nextToken();
|
||||
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
|
||||
assertTrue(!((MutualInformation) aggregatorFactory.getSignificanceHeuristic()).getIncludeNegatives());
|
||||
assertThat(stParser.currentToken(), equalTo(null));
|
||||
stParser.close();
|
||||
|
||||
// test mutual_information with builders
|
||||
stBuilder = new SignificantTermsBuilder("testagg");
|
||||
stBuilder.significanceHeuristic(new MutualInformation.MutualInformationBuilder(false, true)).field("text").minDocCount(200);
|
||||
stXContentBuilder = XContentFactory.jsonBuilder();
|
||||
stBuilder.internalXContent(stXContentBuilder, null);
|
||||
stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string());
|
||||
stParser.nextToken();
|
||||
aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
stParser.nextToken();
|
||||
assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l));
|
||||
assertTrue(!((MutualInformation) aggregatorFactory.getSignificanceHeuristic()).getIncludeNegatives());
|
||||
assertThat(stParser.currentToken(), equalTo(null));
|
||||
stParser.close();
|
||||
|
||||
// test exceptions
|
||||
try {
|
||||
// 1. invalid field
|
||||
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}\"min_doc_count\":200}");
|
||||
stParser.nextToken();
|
||||
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
fail();
|
||||
} catch (ElasticsearchParseException e) {
|
||||
assertTrue(e.getMessage().contains("unknown for mutual_information"));
|
||||
}
|
||||
|
||||
try {
|
||||
// 2. unknown field in jlh_score
|
||||
stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", \"jlh\":{\"unknown_field\": true}, \"min_doc_count\":200}");
|
||||
stParser.nextToken();
|
||||
new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext);
|
||||
fail();
|
||||
} catch (ElasticsearchParseException e) {
|
||||
assertTrue(e.getMessage().contains("expected }, got "));
|
||||
}
|
||||
return aggregatorFactory.getSignificanceHeuristic();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAssertions() throws Exception {
|
||||
MutualInformation mutualInformation = new MutualInformation(true, true);
|
||||
protected SignificanceHeuristic parseFromString(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String heuristicString) throws IOException {
|
||||
XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + heuristicString + ", \"min_doc_count\":200}");
|
||||
return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser);
|
||||
}
|
||||
|
||||
void testBackgroundAssertions(SignificanceHeuristic heuristicIsSuperset, SignificanceHeuristic heuristicNotSuperset) {
|
||||
try {
|
||||
mutualInformation.getScore(2, 3, 1, 4);
|
||||
heuristicIsSuperset.getScore(2, 3, 1, 4);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > supersetFreq"));
|
||||
}
|
||||
try {
|
||||
mutualInformation.getScore(1, 4, 2, 3);
|
||||
heuristicIsSuperset.getScore(1, 4, 2, 3);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("subsetSize > supersetSize"));
|
||||
}
|
||||
try {
|
||||
mutualInformation.getScore(2, 1, 3, 4);
|
||||
heuristicIsSuperset.getScore(2, 1, 3, 4);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize"));
|
||||
}
|
||||
try {
|
||||
mutualInformation.getScore(1, 2, 4, 3);
|
||||
heuristicIsSuperset.getScore(1, 2, 4, 3);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
|
||||
}
|
||||
try {
|
||||
mutualInformation.getScore(1, 3, 4, 4);
|
||||
heuristicIsSuperset.getScore(1, 3, 4, 4);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException assertionError) {
|
||||
assertNotNull(assertionError.getMessage());
|
||||
|
@ -238,70 +244,58 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||
int idx = randomInt(3);
|
||||
long[] values = {1, 2, 3, 4};
|
||||
values[idx] *= -1;
|
||||
mutualInformation.getScore(values[0], values[1], values[2], values[3]);
|
||||
heuristicIsSuperset.getScore(values[0], values[1], values[2], values[3]);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
|
||||
}
|
||||
mutualInformation = new MutualInformation(true, false);
|
||||
double score = mutualInformation.getScore(2, 3, 1, 4);
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
score = mutualInformation.getScore(1, 4, 2, 3);
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
|
||||
try {
|
||||
mutualInformation.getScore(2, 1, 3, 4);
|
||||
heuristicNotSuperset.getScore(2, 1, 3, 4);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize"));
|
||||
}
|
||||
try {
|
||||
mutualInformation.getScore(1, 2, 4, 3);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
|
||||
}
|
||||
|
||||
score = mutualInformation.getScore(1, 3, 4, 4);
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
|
||||
try {
|
||||
int idx = randomInt(3);
|
||||
long[] values = {1, 2, 3, 4};
|
||||
values[idx] *= -1;
|
||||
mutualInformation.getScore(values[0], values[1], values[2], values[3]);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
|
||||
}
|
||||
|
||||
JLHScore jlhScore = JLHScore.INSTANCE;
|
||||
try {
|
||||
int idx = randomInt(3);
|
||||
long[] values = {1, 2, 3, 4};
|
||||
values[idx] *= -1;
|
||||
jlhScore.getScore(values[0], values[1], values[2], values[3]);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
|
||||
}
|
||||
try {
|
||||
jlhScore.getScore(1, 2, 4, 3);
|
||||
heuristicNotSuperset.getScore(1, 2, 4, 3);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
|
||||
}
|
||||
try {
|
||||
jlhScore.getScore(2, 1, 3, 4);
|
||||
int idx = randomInt(3);
|
||||
long[] values = {1, 2, 3, 4};
|
||||
values[idx] *= -1;
|
||||
heuristicNotSuperset.getScore(values[0], values[1], values[2], values[3]);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
|
||||
}
|
||||
}
|
||||
|
||||
void testAssertions(SignificanceHeuristic heuristic) {
|
||||
try {
|
||||
int idx = randomInt(3);
|
||||
long[] values = {1, 2, 3, 4};
|
||||
values[idx] *= -1;
|
||||
heuristic.getScore(values[0], values[1], values[2], values[3]);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive"));
|
||||
}
|
||||
try {
|
||||
heuristic.getScore(1, 2, 4, 3);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize"));
|
||||
}
|
||||
try {
|
||||
heuristic.getScore(2, 1, 3, 4);
|
||||
fail();
|
||||
} catch (ElasticsearchIllegalArgumentException illegalArgumentException) {
|
||||
assertNotNull(illegalArgumentException.getMessage());
|
||||
|
@ -310,11 +304,30 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void scoreDefault() {
|
||||
SignificanceHeuristic heuristic = JLHScore.INSTANCE;
|
||||
public void testAssertions() throws Exception {
|
||||
testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false));
|
||||
testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false));
|
||||
testBackgroundAssertions(new GND(true), new GND(false));
|
||||
testAssertions(JLHScore.INSTANCE);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void basicScoreProperties() {
|
||||
basicScoreProperties(JLHScore.INSTANCE, true);
|
||||
basicScoreProperties(new GND(true), true);
|
||||
basicScoreProperties(new MutualInformation(true, true), false);
|
||||
basicScoreProperties(new ChiSquare(true, true), false);
|
||||
}
|
||||
|
||||
public void basicScoreProperties(SignificanceHeuristic heuristic, boolean test0) {
|
||||
|
||||
assertThat(heuristic.getScore(1, 1, 1, 3), greaterThan(0.0));
|
||||
assertThat(heuristic.getScore(1, 1, 2, 3), lessThan(heuristic.getScore(1, 1, 1, 3)));
|
||||
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0));
|
||||
assertThat(heuristic.getScore(1, 1, 3, 4), lessThan(heuristic.getScore(1, 1, 2, 4)));
|
||||
if (test0) {
|
||||
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0));
|
||||
}
|
||||
|
||||
double score = 0.0;
|
||||
try {
|
||||
long a = randomLong();
|
||||
|
@ -350,7 +363,34 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
heuristic = new MutualInformation(false, true);
|
||||
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(-1.0 * Double.MAX_VALUE));
|
||||
assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(Double.NEGATIVE_INFINITY));
|
||||
|
||||
heuristic = new MutualInformation(true, false);
|
||||
score = heuristic.getScore(2, 3, 1, 4);
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
score = heuristic.getScore(1, 4, 2, 3);
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
score = heuristic.getScore(1, 3, 4, 4);
|
||||
assertThat(score, greaterThanOrEqualTo(0.0));
|
||||
assertThat(score, lessThanOrEqualTo(1.0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGNDCornerCases() throws Exception {
|
||||
GND gnd = new GND(true);
|
||||
//term is only in the subset, not at all in the other set but that is because the other set is empty.
|
||||
// this should actually not happen because only terms that are in the subset are considered now,
|
||||
// however, in this case the score should be 0 because a term that does not exist cannot be relevant...
|
||||
assertThat(gnd.getScore(0, randomIntBetween(1, 2), 0, randomIntBetween(2,3)), equalTo(0.0));
|
||||
// the terms do not co-occur at all - should be 0
|
||||
assertThat(gnd.getScore(0, randomIntBetween(1, 2), randomIntBetween(2, 3), randomIntBetween(5,6)), equalTo(0.0));
|
||||
// comparison between two terms that do not exist - probably not relevant
|
||||
assertThat(gnd.getScore(0, 0, 0, randomIntBetween(1,2)), equalTo(0.0));
|
||||
// terms co-occur perfectly - should be 1
|
||||
assertThat(gnd.getScore(1, 1, 1, 1), equalTo(1.0));
|
||||
gnd = new GND(false);
|
||||
assertThat(gnd.getScore(0, 0, 0, 0), equalTo(0.0));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue