[7.x][ML] Additional outlier detection parameters (#47600) (#47669)

Adds the following parameters to `outlier_detection`:

- `compute_feature_influence` (boolean): whether to compute
   feature influence scores
- `outlier_fraction` (double): the proportion of the data set assumed
   to be outlying prior to running outlier detection
- `standardization_enabled` (boolean): whether to apply standardization
   to the feature values

Backport of #47600
This commit is contained in:
Dimitris Athanasiou 2019-10-07 18:21:33 +03:00 committed by GitHub
parent 833ed30f0d
commit 7667ea5f6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 563 additions and 69 deletions

View File

@ -19,7 +19,6 @@
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.ObjectParser;
@ -48,6 +47,9 @@ public class OutlierDetection implements DataFrameAnalysis {
static final ParseField N_NEIGHBORS = new ParseField("n_neighbors");
static final ParseField METHOD = new ParseField("method");
public static final ParseField FEATURE_INFLUENCE_THRESHOLD = new ParseField("feature_influence_threshold");
static final ParseField COMPUTE_FEATURE_INFLUENCE = new ParseField("compute_feature_influence");
static final ParseField OUTLIER_FRACTION = new ParseField("outlier_fraction");
static final ParseField STANDARDIZATION_ENABLED = new ParseField("standardization_enabled");
private static ObjectParser<Builder, Void> PARSER = new ObjectParser<>(NAME.getPreferredName(), true, Builder::new);
@ -60,22 +62,49 @@ public class OutlierDetection implements DataFrameAnalysis {
throw new IllegalArgumentException("Unsupported token [" + p.currentToken() + "]");
}, METHOD, ObjectParser.ValueType.STRING);
PARSER.declareDouble(Builder::setFeatureInfluenceThreshold, FEATURE_INFLUENCE_THRESHOLD);
PARSER.declareBoolean(Builder::setComputeFeatureInfluence, COMPUTE_FEATURE_INFLUENCE);
PARSER.declareDouble(Builder::setOutlierFraction, OUTLIER_FRACTION);
PARSER.declareBoolean(Builder::setStandardizationEnabled, STANDARDIZATION_ENABLED);
}
/**
* The number of neighbors. Leave unspecified for dynamic detection.
*/
private final Integer nNeighbors;
/**
* The method. Leave unspecified for a dynamic mixture of methods.
*/
private final Method method;
/**
* The min outlier score required to calculate feature influence. Defaults to 0.1.
*/
private final Double featureInfluenceThreshold;
/**
* Constructs the outlier detection configuration
* @param nNeighbors The number of neighbors. Leave unspecified for dynamic detection.
* @param method The method. Leave unspecified for a dynamic mixture of methods.
* @param featureInfluenceThreshold The min outlier score required to calculate feature influence. Defaults to 0.1.
* Whether to compute feature influence or not. Defaults to true.
*/
private OutlierDetection(@Nullable Integer nNeighbors, @Nullable Method method, @Nullable Double featureInfluenceThreshold) {
private final Boolean computeFeatureInfluence;
/**
* The proportion of data assumed to be outlying prior to outlier detection. Defaults to 0.05.
*/
private final Double outlierFraction;
/**
* Whether to perform standardization.
*/
private final Boolean standardizationEnabled;
private OutlierDetection(Integer nNeighbors, Method method, Double featureInfluenceThreshold, Boolean computeFeatureInfluence,
Double outlierFraction, Boolean standardizationEnabled) {
this.nNeighbors = nNeighbors;
this.method = method;
this.featureInfluenceThreshold = featureInfluenceThreshold;
this.computeFeatureInfluence = computeFeatureInfluence;
this.outlierFraction = outlierFraction;
this.standardizationEnabled = standardizationEnabled;
}
@Override
@ -95,6 +124,18 @@ public class OutlierDetection implements DataFrameAnalysis {
return featureInfluenceThreshold;
}
/**
 * Returns whether feature influence should be computed.
 *
 * @return the configured value, or {@code null} when it was never set on the builder
 */
public Boolean getComputeFeatureInfluence() {
return computeFeatureInfluence;
}
/**
 * Returns the proportion of the data set assumed to be outlying prior to outlier detection.
 *
 * @return the configured value, or {@code null} when it was never set on the builder
 */
public Double getOutlierFraction() {
return outlierFraction;
}
/**
 * Returns whether standardization should be applied to the feature values.
 *
 * @return the configured value, or {@code null} when it was never set on the builder
 */
public Boolean getStandardizationEnabled() {
return standardizationEnabled;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -107,6 +148,15 @@ public class OutlierDetection implements DataFrameAnalysis {
if (featureInfluenceThreshold != null) {
builder.field(FEATURE_INFLUENCE_THRESHOLD.getPreferredName(), featureInfluenceThreshold);
}
if (computeFeatureInfluence != null) {
builder.field(COMPUTE_FEATURE_INFLUENCE.getPreferredName(), computeFeatureInfluence);
}
if (outlierFraction != null) {
builder.field(OUTLIER_FRACTION.getPreferredName(), outlierFraction);
}
if (standardizationEnabled != null) {
builder.field(STANDARDIZATION_ENABLED.getPreferredName(), standardizationEnabled);
}
builder.endObject();
return builder;
}
@ -119,12 +169,16 @@ public class OutlierDetection implements DataFrameAnalysis {
OutlierDetection other = (OutlierDetection) o;
return Objects.equals(nNeighbors, other.nNeighbors)
&& Objects.equals(method, other.method)
&& Objects.equals(featureInfluenceThreshold, other.featureInfluenceThreshold);
&& Objects.equals(featureInfluenceThreshold, other.featureInfluenceThreshold)
&& Objects.equals(computeFeatureInfluence, other.computeFeatureInfluence)
&& Objects.equals(outlierFraction, other.outlierFraction)
&& Objects.equals(standardizationEnabled, other.standardizationEnabled);
}
@Override
public int hashCode() {
return Objects.hash(nNeighbors, method, featureInfluenceThreshold);
return Objects.hash(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
standardizationEnabled);
}
@Override
@ -150,6 +204,9 @@ public class OutlierDetection implements DataFrameAnalysis {
private Integer nNeighbors;
private Method method;
private Double featureInfluenceThreshold;
private Boolean computeFeatureInfluence;
private Double outlierFraction;
private Boolean standardizationEnabled;
private Builder() {}
@ -168,8 +225,24 @@ public class OutlierDetection implements DataFrameAnalysis {
return this;
}
/**
 * Sets whether feature influence should be computed.
 * A {@code null} value leaves the parameter unspecified; it is then omitted from the serialized configuration.
 *
 * @param computeFeatureInfluence the value to use, or {@code null} to leave it unspecified
 * @return this builder
 */
public Builder setComputeFeatureInfluence(Boolean computeFeatureInfluence) {
this.computeFeatureInfluence = computeFeatureInfluence;
return this;
}
/**
 * Sets the proportion of the data set assumed to be outlying prior to outlier detection.
 * A {@code null} value leaves the parameter unspecified; it is then omitted from the serialized configuration.
 * No range check is performed by this setter.
 *
 * @param outlierFraction the value to use, or {@code null} to leave it unspecified
 * @return this builder
 */
public Builder setOutlierFraction(Double outlierFraction) {
this.outlierFraction = outlierFraction;
return this;
}
/**
 * Sets whether standardization should be applied to the feature values.
 * A {@code null} value leaves the parameter unspecified; it is then omitted from the serialized configuration.
 *
 * @param standardizationEnabled the value to use, or {@code null} to leave it unspecified
 * @return this builder
 */
public Builder setStandardizationEnabled(Boolean standardizationEnabled) {
this.standardizationEnabled = standardizationEnabled;
return this;
}
public OutlierDetection build() {
return new OutlierDetection(nNeighbors, method, featureInfluenceThreshold);
return new OutlierDetection(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
standardizationEnabled);
}
}
}

View File

@ -1276,7 +1276,10 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
assertThat(createdConfig.getSource().getQueryConfig(), equalTo(new QueryConfig(new MatchAllQueryBuilder()))); // default value
assertThat(createdConfig.getDest().getIndex(), equalTo(config.getDest().getIndex()));
assertThat(createdConfig.getDest().getResultsField(), equalTo("ml")); // default value
assertThat(createdConfig.getAnalysis(), equalTo(config.getAnalysis()));
assertThat(createdConfig.getAnalysis(), equalTo(OutlierDetection.builder()
.setComputeFeatureInfluence(true)
.setOutlierFraction(0.05)
.setStandardizationEnabled(true).build()));
assertThat(createdConfig.getAnalyzedFields(), equalTo(config.getAnalyzedFields()));
assertThat(createdConfig.getModelMemoryLimit(), equalTo(ByteSizeValue.parseBytesSizeValue("1gb", ""))); // default value
assertThat(createdConfig.getDescription(), equalTo("some description"));

View File

@ -2932,6 +2932,10 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
DataFrameAnalysis outlierDetectionCustomized = OutlierDetection.builder() // <1>
.setMethod(OutlierDetection.Method.DISTANCE_KNN) // <2>
.setNNeighbors(5) // <3>
.setFeatureInfluenceThreshold(0.1) // <4>
.setComputeFeatureInfluence(true) // <5>
.setOutlierFraction(0.05) // <6>
.setStandardizationEnabled(true) // <7>
.build();
// end::put-data-frame-analytics-outlier-detection-customized

View File

@ -26,6 +26,7 @@ import java.io.IOException;
import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
public class OutlierDetectionTests extends AbstractXContentTestCase<OutlierDetection> {
@ -34,6 +35,9 @@ public class OutlierDetectionTests extends AbstractXContentTestCase<OutlierDetec
.setNNeighbors(randomBoolean() ? null : randomIntBetween(1, 20))
.setMethod(randomBoolean() ? null : randomFrom(OutlierDetection.Method.values()))
.setFeatureInfluenceThreshold(randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true))
.setComputeFeatureInfluence(randomBoolean() ? null : randomBoolean())
.setOutlierFraction(randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true))
.setStandardizationEnabled(randomBoolean() ? null : randomBoolean())
.build();
}
@ -57,6 +61,9 @@ public class OutlierDetectionTests extends AbstractXContentTestCase<OutlierDetec
assertNull(outlierDetection.getNNeighbors());
assertNull(outlierDetection.getMethod());
assertNull(outlierDetection.getFeatureInfluenceThreshold());
assertNull(outlierDetection.getComputeFeatureInfluence());
assertNull(outlierDetection.getOutlierFraction());
assertNull(outlierDetection.getStandardizationEnabled());
}
public void testGetParams_GivenExplicitValues() {
@ -65,9 +72,15 @@ public class OutlierDetectionTests extends AbstractXContentTestCase<OutlierDetec
.setNNeighbors(42)
.setMethod(OutlierDetection.Method.LDOF)
.setFeatureInfluenceThreshold(0.5)
.setComputeFeatureInfluence(true)
.setOutlierFraction(0.42)
.setStandardizationEnabled(false)
.build();
assertThat(outlierDetection.getNNeighbors(), equalTo(42));
assertThat(outlierDetection.getMethod(), equalTo(OutlierDetection.Method.LDOF));
assertThat(outlierDetection.getFeatureInfluenceThreshold(), closeTo(0.5, 1E-9));
assertThat(outlierDetection.getComputeFeatureInfluence(), is(true));
assertThat(outlierDetection.getOutlierFraction(), closeTo(0.42, 1E-9));
assertThat(outlierDetection.getStandardizationEnabled(), is(false));
}
}

View File

@ -96,6 +96,10 @@ include-tagged::{doc-tests-file}[{api}-outlier-detection-customized]
<1> Constructing a new OutlierDetection object
<2> The method used to perform the analysis
<3> Number of neighbors taken into account during analysis
<4> The min `outlier_score` required to compute feature influence
<5> Whether to compute feature influence
<6> The proportion of the data set that is assumed to be outlying prior to outlier detection
<7> Whether to apply standardization to feature values
===== Regression

View File

@ -134,7 +134,7 @@ An `outlier_detection` configuration object has the following properties:
{oldetection}. For example, 0.05 means it is assumed that 5% of values are real outliers
and 95% are inliers.
`standardize_columns`::
`standardization_enabled`::
(boolean) If `true`, then the following operation is performed on the columns
before computing outlier scores: (x_i - mean(x_i)) / sd(x_i). Defaults to
`true`. For more information, see

View File

@ -140,6 +140,9 @@ PUT _ml/data_frame/analytics/loganalytics
},
"analysis": {
"outlier_detection": {
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
}
}
@ -166,8 +169,12 @@ The API returns the following result:
"index" : "logdata_out",
"results_field" : "ml"
},
"analysis" : {
"outlier_detection" : { }
"analysis": {
"outlier_detection": {
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
},
"model_memory_limit" : "1gb",
"create_time" : 1562351429434,

View File

@ -5,11 +5,11 @@
*/
package org.elasticsearch.xpack.core.ml.dataframe.analyses;
import org.elasticsearch.Version;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
@ -30,39 +30,68 @@ public class OutlierDetection implements DataFrameAnalysis {
public static final ParseField N_NEIGHBORS = new ParseField("n_neighbors");
public static final ParseField METHOD = new ParseField("method");
public static final ParseField FEATURE_INFLUENCE_THRESHOLD = new ParseField("feature_influence_threshold");
public static final ParseField COMPUTE_FEATURE_INFLUENCE = new ParseField("compute_feature_influence");
public static final ParseField OUTLIER_FRACTION = new ParseField("outlier_fraction");
public static final ParseField STANDARDIZATION_ENABLED = new ParseField("standardization_enabled");
private static final ConstructingObjectParser<OutlierDetection, Void> LENIENT_PARSER = createParser(true);
private static final ConstructingObjectParser<OutlierDetection, Void> STRICT_PARSER = createParser(false);
private static final ObjectParser<Builder, Void> LENIENT_PARSER = createParser(true);
private static final ObjectParser<Builder, Void> STRICT_PARSER = createParser(false);
private static ConstructingObjectParser<OutlierDetection, Void> createParser(boolean lenient) {
ConstructingObjectParser<OutlierDetection, Void> parser = new ConstructingObjectParser<>(NAME.getPreferredName(), lenient,
a -> new OutlierDetection((Integer) a[0], (Method) a[1], (Double) a[2]));
parser.declareInt(ConstructingObjectParser.optionalConstructorArg(), N_NEIGHBORS);
parser.declareField(ConstructingObjectParser.optionalConstructorArg(), p -> {
private static ObjectParser<Builder, Void> createParser(boolean lenient) {
ObjectParser<Builder, Void> parser = new ObjectParser<>(NAME.getPreferredName(), lenient, Builder::new);
parser.declareInt(Builder::setNNeighbors, N_NEIGHBORS);
parser.declareField(Builder::setMethod, p -> {
if (p.currentToken() == XContentParser.Token.VALUE_STRING) {
return Method.fromString(p.text());
}
throw new IllegalArgumentException("Unsupported token [" + p.currentToken() + "]");
}, METHOD, ObjectParser.ValueType.STRING);
parser.declareDouble(ConstructingObjectParser.optionalConstructorArg(), FEATURE_INFLUENCE_THRESHOLD);
parser.declareDouble(Builder::setFeatureInfluenceThreshold, FEATURE_INFLUENCE_THRESHOLD);
parser.declareBoolean(Builder::setComputeFeatureInfluence, COMPUTE_FEATURE_INFLUENCE);
parser.declareDouble(Builder::setOutlierFraction, OUTLIER_FRACTION);
parser.declareBoolean(Builder::setStandardizationEnabled, STANDARDIZATION_ENABLED);
return parser;
}
public static OutlierDetection fromXContent(XContentParser parser, boolean ignoreUnknownFields) {
return ignoreUnknownFields ? LENIENT_PARSER.apply(parser, null) : STRICT_PARSER.apply(parser, null);
return ignoreUnknownFields ? LENIENT_PARSER.apply(parser, null).build() : STRICT_PARSER.apply(parser, null).build();
}
/**
* The number of neighbors. Leave unspecified for dynamic detection.
*/
@Nullable
private final Integer nNeighbors;
/**
* The method. Leave unspecified for a dynamic mixture of methods.
*/
@Nullable
private final Method method;
/**
* The min outlier score required to calculate feature influence. Defaults to 0.1.
*/
@Nullable
private final Double featureInfluenceThreshold;
/**
* Constructs the outlier detection configuration
* @param nNeighbors The number of neighbors. Leave unspecified for dynamic detection.
* @param method The method. Leave unspecified for a dynamic mixture of methods.
* @param featureInfluenceThreshold The min outlier score required to calculate feature influence. Defaults to 0.1.
* Whether to compute feature influence or not. Defaults to true.
*/
public OutlierDetection(@Nullable Integer nNeighbors, @Nullable Method method, @Nullable Double featureInfluenceThreshold) {
private final boolean computeFeatureInfluence;
/**
* The proportion of data assumed to be outlying prior to outlier detection. Defaults to 0.05.
*/
private final double outlierFraction;
/**
* Whether to perform standardization.
*/
private final boolean standardizationEnabled;
private OutlierDetection(Integer nNeighbors, Method method, Double featureInfluenceThreshold, boolean computeFeatureInfluence,
double outlierFraction, boolean standardizationEnabled) {
if (nNeighbors != null && nNeighbors <= 0) {
throw ExceptionsHelper.badRequestException("[{}] must be a positive integer", N_NEIGHBORS.getPreferredName());
}
@ -71,22 +100,31 @@ public class OutlierDetection implements DataFrameAnalysis {
throw ExceptionsHelper.badRequestException("[{}] must be in [0, 1]", FEATURE_INFLUENCE_THRESHOLD.getPreferredName());
}
if (outlierFraction < 0.0 || outlierFraction > 1.0) {
throw ExceptionsHelper.badRequestException("[{}] must be in [0, 1]", OUTLIER_FRACTION.getPreferredName());
}
this.nNeighbors = nNeighbors;
this.method = method;
this.featureInfluenceThreshold = featureInfluenceThreshold;
}
/**
* Constructs the default outlier detection configuration
*/
public OutlierDetection() {
this(null, null, null);
this.computeFeatureInfluence = computeFeatureInfluence;
this.outlierFraction = outlierFraction;
this.standardizationEnabled = standardizationEnabled;
}
/**
 * Reads an outlier detection configuration from the transport stream.
 * The read order below is the wire format and must stay in step with the matching write method.
 *
 * @param in the stream to read from; its version decides whether the newer parameters are present
 * @throws IOException if reading from the stream fails
 */
public OutlierDetection(StreamInput in) throws IOException {
nNeighbors = in.readOptionalVInt();
// The method is written as a presence flag followed by the enum value.
method = in.readBoolean() ? in.readEnum(Method.class) : null;
featureInfluenceThreshold = in.readOptionalDouble();
// The three parameters below are only on the wire from 7.5.0 onwards.
if (in.getVersion().onOrAfter(Version.V_7_5_0)) {
computeFeatureInfluence = in.readBoolean();
outlierFraction = in.readDouble();
standardizationEnabled = in.readBoolean();
} else {
// Older senders do not transmit these values: fall back to the same defaults the Builder starts with.
computeFeatureInfluence = true;
outlierFraction = 0.05;
standardizationEnabled = true;
}
}
@Override
@ -106,6 +144,12 @@ public class OutlierDetection implements DataFrameAnalysis {
}
out.writeOptionalDouble(featureInfluenceThreshold);
if (out.getVersion().onOrAfter(Version.V_7_5_0)) {
out.writeBoolean(computeFeatureInfluence);
out.writeDouble(outlierFraction);
out.writeBoolean(standardizationEnabled);
}
}
@Override
@ -120,6 +164,9 @@ public class OutlierDetection implements DataFrameAnalysis {
if (featureInfluenceThreshold != null) {
builder.field(FEATURE_INFLUENCE_THRESHOLD.getPreferredName(), featureInfluenceThreshold);
}
builder.field(COMPUTE_FEATURE_INFLUENCE.getPreferredName(), computeFeatureInfluence);
builder.field(OUTLIER_FRACTION.getPreferredName(), outlierFraction);
builder.field(STANDARDIZATION_ENABLED.getPreferredName(), standardizationEnabled);
builder.endObject();
return builder;
}
@ -131,12 +178,16 @@ public class OutlierDetection implements DataFrameAnalysis {
OutlierDetection that = (OutlierDetection) o;
return Objects.equals(nNeighbors, that.nNeighbors)
&& Objects.equals(method, that.method)
&& Objects.equals(featureInfluenceThreshold, that.featureInfluenceThreshold);
&& Objects.equals(featureInfluenceThreshold, that.featureInfluenceThreshold)
&& computeFeatureInfluence == that.computeFeatureInfluence
&& outlierFraction == that.outlierFraction
&& standardizationEnabled == that.standardizationEnabled;
}
@Override
public int hashCode() {
return Objects.hash(nNeighbors, method, featureInfluenceThreshold);
return Objects.hash(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
standardizationEnabled);
}
@Override
@ -151,6 +202,9 @@ public class OutlierDetection implements DataFrameAnalysis {
if (featureInfluenceThreshold != null) {
params.put(FEATURE_INFLUENCE_THRESHOLD.getPreferredName(), featureInfluenceThreshold);
}
params.put(COMPUTE_FEATURE_INFLUENCE.getPreferredName(), computeFeatureInfluence);
params.put(OUTLIER_FRACTION.getPreferredName(), outlierFraction);
params.put(STANDARDIZATION_ENABLED.getPreferredName(), standardizationEnabled);
return params;
}
@ -191,4 +245,49 @@ public class OutlierDetection implements DataFrameAnalysis {
return name().toLowerCase(Locale.ROOT);
}
}
/**
 * Builder for {@link OutlierDetection}.
 * <p>
 * {@code nNeighbors}, {@code method} and {@code featureInfluenceThreshold} are optional and stay {@code null}
 * unless set. The remaining three parameters are primitives that start out with default values, so an instance
 * built without calling any setter has {@code compute_feature_influence = true}, {@code outlier_fraction = 0.05}
 * and {@code standardization_enabled = true}.
 * <p>
 * The setters do not validate their arguments; validation happens in the {@link OutlierDetection} constructor
 * invoked by {@link #build()}.
 */
public static class Builder {
// Optional parameters: null means "not specified".
private Integer nNeighbors;
private Method method;
private Double featureInfluenceThreshold;
// Parameters that always have a value; initialised to their defaults.
private boolean computeFeatureInfluence = true;
private double outlierFraction = 0.05;
private boolean standardizationEnabled = true;
/**
 * Sets the number of neighbors.
 *
 * @param nNeighbors the number of neighbors, or {@code null} to leave it unspecified
 * @return this builder
 */
public Builder setNNeighbors(Integer nNeighbors) {
this.nNeighbors = nNeighbors;
return this;
}
/**
 * Sets the outlier detection method.
 *
 * @param method the method, or {@code null} to leave it unspecified
 * @return this builder
 */
public Builder setMethod(Method method) {
this.method = method;
return this;
}
/**
 * Sets the minimum outlier score required to calculate feature influence.
 *
 * @param featureInfluenceThreshold the threshold, or {@code null} to leave it unspecified
 * @return this builder
 */
public Builder setFeatureInfluenceThreshold(Double featureInfluenceThreshold) {
this.featureInfluenceThreshold = featureInfluenceThreshold;
return this;
}
/**
 * Sets whether feature influence is computed. Defaults to {@code true}.
 *
 * @param computeFeatureInfluence whether to compute feature influence
 * @return this builder
 */
public Builder setComputeFeatureInfluence(boolean computeFeatureInfluence) {
this.computeFeatureInfluence = computeFeatureInfluence;
return this;
}
/**
 * Sets the proportion of data assumed to be outlying prior to outlier detection. Defaults to {@code 0.05}.
 *
 * @param outlierFraction the assumed fraction of outliers
 * @return this builder
 */
public Builder setOutlierFraction(double outlierFraction) {
this.outlierFraction = outlierFraction;
return this;
}
/**
 * Sets whether standardization is performed. Defaults to {@code true}.
 *
 * @param standardizationEnabled whether to perform standardization
 * @return this builder
 */
public Builder setStandardizationEnabled(boolean standardizationEnabled) {
this.standardizationEnabled = standardizationEnabled;
return this;
}
/**
 * Creates the {@link OutlierDetection} from the current builder state.
 * The constructor it calls throws a bad-request exception for invalid values,
 * e.g. a non-positive {@code n_neighbors} or an {@code outlier_fraction} outside {@code [0, 1]}.
 *
 * @return the new configuration
 */
public OutlierDetection build() {
return new OutlierDetection(nNeighbors, method, featureInfluenceThreshold, computeFeatureInfluence, outlierFraction,
standardizationEnabled);
}
}
}

View File

@ -32,7 +32,14 @@ public class OutlierDetectionTests extends AbstractSerializingTestCase<OutlierDe
Integer numberNeighbors = randomBoolean() ? null : randomIntBetween(1, 20);
OutlierDetection.Method method = randomBoolean() ? null : randomFrom(OutlierDetection.Method.values());
Double minScoreToWriteFeatureInfluence = randomBoolean() ? null : randomDoubleBetween(0.0, 1.0, true);
return new OutlierDetection(numberNeighbors, method, minScoreToWriteFeatureInfluence);
return new OutlierDetection.Builder()
.setNNeighbors(numberNeighbors)
.setMethod(method)
.setFeatureInfluenceThreshold(minScoreToWriteFeatureInfluence)
.setComputeFeatureInfluence(randomBoolean())
.setOutlierFraction(randomDoubleBetween(0.0, 1.0, true))
.setStandardizationEnabled(randomBoolean())
.build();
}
@Override
@ -41,20 +48,38 @@ public class OutlierDetectionTests extends AbstractSerializingTestCase<OutlierDe
}
public void testGetParams_GivenDefaults() {
OutlierDetection outlierDetection = new OutlierDetection();
assertThat(outlierDetection.getParams().isEmpty(), is(true));
OutlierDetection outlierDetection = new OutlierDetection.Builder().build();
Map<String, Object> params = outlierDetection.getParams();
assertThat(params.size(), equalTo(3));
assertThat(params.containsKey("compute_feature_influence"), is(true));
assertThat(params.get("compute_feature_influence"), is(true));
assertThat(params.containsKey("outlier_fraction"), is(true));
assertThat((double) params.get("outlier_fraction"), closeTo(0.05, 0.0001));
assertThat(params.containsKey("standardization_enabled"), is(true));
assertThat(params.get("standardization_enabled"), is(true));
}
public void testGetParams_GivenExplicitValues() {
OutlierDetection outlierDetection = new OutlierDetection(42, OutlierDetection.Method.LDOF, 0.42);
OutlierDetection outlierDetection = new OutlierDetection.Builder()
.setNNeighbors(42)
.setMethod(OutlierDetection.Method.LDOF)
.setFeatureInfluenceThreshold(0.42)
.setComputeFeatureInfluence(false)
.setOutlierFraction(0.9)
.setStandardizationEnabled(false)
.build();
Map<String, Object> params = outlierDetection.getParams();
assertThat(params.size(), equalTo(3));
assertThat(params.size(), equalTo(6));
assertThat(params.get(OutlierDetection.N_NEIGHBORS.getPreferredName()), equalTo(42));
assertThat(params.get(OutlierDetection.METHOD.getPreferredName()), equalTo(OutlierDetection.Method.LDOF));
assertThat((Double) params.get(OutlierDetection.FEATURE_INFLUENCE_THRESHOLD.getPreferredName()),
is(closeTo(0.42, 1E-9)));
assertThat(params.get(OutlierDetection.COMPUTE_FEATURE_INFLUENCE.getPreferredName()), is(false));
assertThat((Double) params.get(OutlierDetection.OUTLIER_FRACTION.getPreferredName()),
is(closeTo(0.9, 1E-9)));
assertThat(params.get(OutlierDetection.STANDARDIZATION_ENABLED.getPreferredName()), is(false));
}
public void testGetStateDocId() {

View File

@ -62,6 +62,12 @@ integTest.runner {
'ml/data_frame_analytics_crud/Test put config given missing analysis',
'ml/data_frame_analytics_crud/Test put config given empty analysis',
'ml/data_frame_analytics_crud/Test max model memory limit',
'ml/data_frame_analytics_crud/Test put outlier_detection given n_neighbors is negative',
'ml/data_frame_analytics_crud/Test put outlier_detection given n_neighbors is zero',
'ml/data_frame_analytics_crud/Test put outlier_detection given feature_influence_threshold is negative',
'ml/data_frame_analytics_crud/Test put outlier_detection given feature_influence_threshold is greater than one',
'ml/data_frame_analytics_crud/Test put outlier_detection given outlier_fraction is negative',
'ml/data_frame_analytics_crud/Test put outlier_detection given outlier_fraction is greater than one',
'ml/data_frame_analytics_crud/Test put regression given dependent_variable is not defined',
'ml/data_frame_analytics_crud/Test put regression given negative lambda',
'ml/data_frame_analytics_crud/Test put regression given negative gamma',

View File

@ -69,7 +69,8 @@ public class OutlierDetectionWithMissingFieldsIT extends MlNativeDataFrameAnalyt
}
String id = "test_outlier_detection_with_missing_fields";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null, new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null,
new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);

View File

@ -72,7 +72,8 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
}
String id = "test_outlier_detection_with_few_docs";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null, new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null,
new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);
@ -147,7 +148,8 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
}
String id = "test_outlier_detection_with_enough_docs_to_scroll";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", "custom_ml", new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", "custom_ml",
new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);
@ -216,7 +218,8 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
}
String id = "test_outlier_detection_with_more_fields_than_docvalue_limit";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null, new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null,
new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);
@ -279,7 +282,8 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
}
String id = "test_stop_outlier_detection_with_enough_docs_to_scroll";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", "custom_ml", new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", "custom_ml",
new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);
@ -347,7 +351,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
.setId(id)
.setSource(new DataFrameAnalyticsSource(sourceIndex, null))
.setDest(new DataFrameAnalyticsDest(destIndex, null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
registerAnalytics(config);
putAnalytics(config);
@ -405,7 +409,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
}
String id = "test_outlier_detection_with_pre_existing_dest_index";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, destIndex, null, new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, destIndex, null, new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);
@ -461,7 +465,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
.setId(id)
.setSource(new DataFrameAnalyticsSource(new String[] { sourceIndex }, null))
.setDest(new DataFrameAnalyticsDest(sourceIndex + "-results", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.setModelMemoryLimit(modelMemoryLimit)
.build();
@ -503,7 +507,8 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
}
String id = "test_outlier_detection_stop_and_restart";
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", "custom_ml", new OutlierDetection());
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", "custom_ml",
new OutlierDetection.Builder().build());
registerAnalytics(config);
putAnalytics(config);
@ -545,4 +550,92 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
assertProgress(id, 100, 100, 100, 100);
assertThat(searchStoredProgress(id).getHits().getTotalHits().value, equalTo(1L));
}
/**
 * Runs outlier detection with every analysis parameter set explicitly rather than left to its default.
 * Verifies that the one document with an odd value scores higher than the others, that no feature
 * influence is written when {@code compute_feature_influence} is disabled, and that progress and audit
 * messages are as expected.
 */
public void testOutlierDetectionWithCustomParams() throws Exception {
String sourceIndex = "test-outlier-detection-with-custom-params";
// Source index with two numeric fields and one keyword field.
client().admin().indices().prepareCreate(sourceIndex)
.addMapping("_doc", "numeric_1", "type=double", "numeric_2", "type=float", "categorical_1", "type=keyword")
.get();
BulkRequestBuilder bulkRequestBuilder = client().prepareBulk();
// Refresh immediately so the documents are searchable when the analytics job starts.
bulkRequestBuilder.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
for (int i = 0; i < 5; i++) {
IndexRequest indexRequest = new IndexRequest(sourceIndex);
// We insert one odd value out of 5 for one feature
// The document id records which document is the odd one so it can be recognised when checking results.
String docId = i == 0 ? "outlier" : "normal" + i;
indexRequest.id(docId);
indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1.0, "categorical_1", "foo_" + i);
bulkRequestBuilder.add(indexRequest);
}
BulkResponse bulkResponse = bulkRequestBuilder.get();
if (bulkResponse.hasFailures()) {
fail("Failed to index data: " + bulkResponse.buildFailureMessage());
}
String id = "test_outlier_detection_with_custom_params";
// All six outlier detection parameters are given explicitly; feature influence computation is switched off.
DataFrameAnalyticsConfig config = buildAnalytics(id, sourceIndex, sourceIndex + "-results", null,
new OutlierDetection.Builder()
.setNNeighbors(3)
.setMethod(OutlierDetection.Method.DISTANCE_KNN)
.setFeatureInfluenceThreshold(0.01)
.setComputeFeatureInfluence(false)
.setOutlierFraction(0.04)
.setStandardizationEnabled(true)
.build());
registerAnalytics(config);
putAnalytics(config);
// A freshly created job is stopped and has made no progress.
assertState(id, DataFrameAnalyticsState.STOPPED);
assertProgress(id, 0, 0, 0, 0);
startAnalytics(id);
waitUntilAnalyticsIsStopped(id);
SearchResponse sourceData = client().prepareSearch(sourceIndex).get();
double scoreOfOutlier = 0.0;
// -1.0 means "no non-outlier score seen yet"; real scores are asserted below to lie in [0, 1].
double scoreOfNonOutlier = -1.0;
for (SearchHit hit : sourceData.getHits()) {
GetResponse destDocGetResponse = client().prepareGet().setIndex(config.getDest().getIndex()).setId(hit.getId()).get();
assertThat(destDocGetResponse.isExists(), is(true));
Map<String, Object> sourceDoc = hit.getSourceAsMap();
Map<String, Object> destDoc = destDocGetResponse.getSource();
// Every source field must be carried over unchanged to the destination document.
for (String field : sourceDoc.keySet()) {
assertThat(destDoc.containsKey(field), is(true));
assertThat(destDoc.get(field), equalTo(sourceDoc.get(field)));
}
// Results are expected under "ml" because no custom results field was passed to buildAnalytics.
assertThat(destDoc.containsKey("ml"), is(true));
@SuppressWarnings("unchecked")
Map<String, Object> resultsObject = (Map<String, Object>) destDoc.get("ml");
assertThat(resultsObject.containsKey("outlier_score"), is(true));
// compute_feature_influence was set to false above, so no feature influence may be written.
assertThat(resultsObject.containsKey("feature_influence"), is(false));
double outlierScore = (double) resultsObject.get("outlier_score");
assertThat(outlierScore, allOf(greaterThanOrEqualTo(0.0), lessThanOrEqualTo(1.0)));
if (hit.getId().equals("outlier")) {
scoreOfOutlier = outlierScore;
} else {
if (scoreOfNonOutlier < 0) {
scoreOfNonOutlier = outlierScore;
} else {
// The non-outlier documents were indexed with identical numeric values (only the keyword field differs),
// so the test expects them all to receive exactly the same score.
assertThat(outlierScore, equalTo(scoreOfNonOutlier));
}
}
}
assertThat(scoreOfOutlier, is(greaterThan(scoreOfNonOutlier)));
assertProgress(id, 100, 100, 100, 100);
assertThat(searchStoredProgress(id).getHits().getTotalHits().value, equalTo(1L));
assertThatAuditMessagesMatch(id,
"Created analytics with analysis type [outlier_detection]",
"Estimated memory usage for this analytics to be",
"Started analytics",
"Creating destination index [test-outlier-detection-with-custom-params-results]",
"Finished reindexing to destination index [test-outlier-detection-with-custom-params-results]",
"Finished analysis");
}
}

View File

@ -62,7 +62,7 @@ public class DataFrameAnalyticsIndexTests extends ESTestCase {
.setId(ANALYTICS_ID)
.setSource(new DataFrameAnalyticsSource(SOURCE_INDEX, null))
.setDest(new DataFrameAnalyticsDest(DEST_INDEX, null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
private static final int CURRENT_TIME_MILLIS = 123456789;
private static final String CREATED_BY = "data-frame-analytics";

View File

@ -67,7 +67,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("source-1"))
.setDest(new DataFrameAnalyticsDest("dest", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -79,7 +79,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("missing"))
.setDest(new DataFrameAnalyticsDest("dest", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -94,7 +94,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("missing*"))
.setDest(new DataFrameAnalyticsDest("dest", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -109,7 +109,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("source-1"))
.setDest(new DataFrameAnalyticsDest("source-1", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -124,7 +124,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("source-*"))
.setDest(new DataFrameAnalyticsDest(SOURCE_2, null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -139,7 +139,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("source-1,source-*"))
.setDest(new DataFrameAnalyticsDest(SOURCE_2, null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -154,7 +154,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource(SOURCE_1))
.setDest(new DataFrameAnalyticsDest("dest-alias", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());
@ -170,7 +170,7 @@ public class SourceDestValidatorTests extends ESTestCase {
.setId("test")
.setSource(createSource("source-1"))
.setDest(new DataFrameAnalyticsDest("source-1-alias", null))
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
SourceDestValidator validator = new SourceDestValidator(CLUSTER_STATE, new IndexNameExpressionResolver());

View File

@ -508,7 +508,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
.setSource(new DataFrameAnalyticsSource(SOURCE_INDEX, null))
.setDest(new DataFrameAnalyticsDest(DEST_INDEX, RESULTS_FIELD))
.setAnalyzedFields(analyzedFields)
.setAnalysis(new OutlierDetection())
.setAnalysis(new OutlierDetection.Builder().build())
.build();
}

View File

@ -53,7 +53,13 @@ setup:
- match: { source.index: ["index-source"] }
- match: { source.query: {"term" : { "user" : "Kimchy"} } }
- match: { dest.index: "index-dest" }
- match: { analysis: {"outlier_detection":{}} }
- match: { analysis: {
"outlier_detection":{
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
}}
- match: { analyzed_fields: {"includes" : ["obj1.*", "obj2.*" ], "excludes": [] } }
- is_true: create_time
- is_true: version
@ -66,7 +72,13 @@ setup:
- match: { data_frame_analytics.0.source.index: ["index-source"] }
- match: { data_frame_analytics.0.source.query: {"term" : { "user" : "Kimchy"} } }
- match: { data_frame_analytics.0.dest.index: "index-dest" }
- match: { data_frame_analytics.0.analysis: {"outlier_detection":{}} }
- match: { data_frame_analytics.0.analysis: {
"outlier_detection":{
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
}}
- match: { data_frame_analytics.0.analyzed_fields: {"includes" : ["obj1.*", "obj2.*" ], "excludes": [] } }
- is_true: data_frame_analytics.0.create_time
- is_true: data_frame_analytics.0.version
@ -148,7 +160,13 @@ setup:
- match: { source.index: ["index-source"] }
- match: { source.query: {"match_all" : {} } }
- match: { dest.index: "index-dest" }
- match: { analysis: {"outlier_detection":{}} }
- match: { analysis: {
"outlier_detection":{
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
}}
- is_true: create_time
- is_true: version
@ -170,7 +188,10 @@ setup:
"outlier_detection":{
"n_neighbors": 5,
"method": "lof",
"feature_influence_threshold": 0.0
"feature_influence_threshold": 0.0,
"compute_feature_influence": false,
"outlier_fraction": 0.95,
"standardization_enabled": false
}
}
}
@ -178,9 +199,16 @@ setup:
- match: { source.index: ["index-source"] }
- match: { source.query: {"match_all" : {} } }
- match: { dest.index: "index-dest" }
- match: { analysis.outlier_detection.n_neighbors: 5 }
- match: { analysis.outlier_detection.method: "lof" }
- match: { analysis.outlier_detection.feature_influence_threshold: 0.0 }
- match: { analysis: {
"outlier_detection":{
"n_neighbors": 5,
"method": "lof",
"feature_influence_threshold": 0.0,
"compute_feature_influence": false,
"outlier_fraction": 0.95,
"standardization_enabled": false
}
}}
- is_true: create_time
- is_true: version
@ -924,7 +952,13 @@ setup:
- match: { source.index: ["index-source"] }
- match: { source.query: {"term" : { "user" : "Kimchy"} } }
- match: { dest.index: "index-dest" }
- match: { analysis: {"outlier_detection":{}} }
- match: { analysis: {
"outlier_detection":{
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
}}
- match: { analyzed_fields: {"includes" : ["obj1.*", "obj2.*" ], "excludes": [] } }
- match: { model_memory_limit: "20mb" }
@ -938,6 +972,138 @@ setup:
xpack.ml.max_model_memory_limit: null
- match: {transient: {}}
---
# Negative case: puts an outlier_detection config with n_neighbors = -1 and,
# via `catch`, requires the request to fail with an error matching
# "[n_neighbors] must be a positive integer".
"Test put outlier_detection given n_neighbors is negative":
- do:
catch: /\[n_neighbors\] must be a positive integer/
ml.put_data_frame_analytics:
id: "outlier_detection-with-negative-n_neighbors"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {
"outlier_detection": {
"n_neighbors": -1
}
}
}
---
# Boundary case: puts an outlier_detection config with n_neighbors = 0 and,
# via `catch`, requires the same "[n_neighbors] must be a positive integer"
# error as the negative-value test.
"Test put outlier_detection given n_neighbors is zero":
- do:
catch: /\[n_neighbors\] must be a positive integer/
ml.put_data_frame_analytics:
id: "outlier_detection-with-zero-n_neighbors"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {
"outlier_detection": {
"n_neighbors": 0
}
}
}
---
# Lower-bound case: puts an outlier_detection config with
# feature_influence_threshold = -0.001 and, via `catch`, requires an error
# matching "[feature_influence_threshold] must be in [0, 1]".
"Test put outlier_detection given feature_influence_threshold is negative":
- do:
catch: /\[feature_influence_threshold\] must be in \[0, 1\]/
ml.put_data_frame_analytics:
id: "outlier_detection-with-negative-feature_influence_threshold"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {
"outlier_detection": {
"feature_influence_threshold": -0.001
}
}
}
---
# Upper-bound case: puts an outlier_detection config with
# feature_influence_threshold = 1.001 and, via `catch`, requires an error
# matching "[feature_influence_threshold] must be in [0, 1]".
# The id names the scenario under test (value too large); it previously
# reused the "negative" id from the lower-bound test. It is kept short on
# purpose -- NOTE(review): analytics ids are presumably limited to 64
# characters; confirm before lengthening.
"Test put outlier_detection given feature_influence_threshold is greater than one":
- do:
catch: /\[feature_influence_threshold\] must be in \[0, 1\]/
ml.put_data_frame_analytics:
id: "outlier_detection-with-too-large-feature_influence_threshold"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {
"outlier_detection": {
"feature_influence_threshold": 1.001
}
}
}
---
# Lower-bound case: puts an outlier_detection config with
# outlier_fraction = -0.001 and, via `catch`, requires an error matching
# "[outlier_fraction] must be in [0, 1]".
"Test put outlier_detection given outlier_fraction is negative":
- do:
catch: /\[outlier_fraction\] must be in \[0, 1\]/
ml.put_data_frame_analytics:
id: "outlier_detection-with-negative-outlier_fraction"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {
"outlier_detection": {
"outlier_fraction": -0.001
}
}
}
---
# Upper-bound case: puts an outlier_detection config with
# outlier_fraction = 1.001 and, via `catch`, requires an error matching
# "[outlier_fraction] must be in [0, 1]".
# The id names the scenario under test (value too large); it previously
# reused the "negative" id from the lower-bound test.
"Test put outlier_detection given outlier_fraction is greater than one":
- do:
catch: /\[outlier_fraction\] must be in \[0, 1\]/
ml.put_data_frame_analytics:
id: "outlier_detection-with-too-large-outlier_fraction"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {
"outlier_detection": {
"outlier_fraction": 1.001
}
}
}
---
"Test put regression given dependent_variable is not defined":