[ML] Add more categorization validations (elastic/x-pack-elasticsearch#1019)

- validates that when mlcategory is used, categorization_field_name is
set
- validates that when categorization_field_name is set, mlcategory is
used

relates elastic/x-pack-elasticsearch#986

Original commit: elastic/x-pack-elasticsearch@e861a3ed58
This commit is contained in:
Dimitris Athanasiou 2017-04-09 18:18:17 +01:00 committed by GitHub
parent 0df726f6dd
commit a1cb22836c
6 changed files with 143 additions and 47 deletions

View File

@ -263,11 +263,7 @@ public class AnalysisConfig extends ToXContentToBytes implements Writeable {
public Set<String> termFields() {
Set<String> termFields = new TreeSet<>();
for (Detector d : getDetectors()) {
addIfNotNull(termFields, d.getByFieldName());
addIfNotNull(termFields, d.getOverFieldName());
addIfNotNull(termFields, d.getPartitionFieldName());
}
getDetectors().stream().forEach(d -> termFields.addAll(d.getByOverPartitionTerms()));
for (String i : getInfluencers()) {
addIfNotNull(termFields, i);
@ -561,11 +557,12 @@ public class AnalysisConfig extends ToXContentToBytes implements Writeable {
}
checkFieldIsNotNegativeIfSpecified(PERIOD.getPreferredName(), period);
verifyDetectorAreDefined(detectors);
verifyDetectorAreDefined();
verifyFieldName(summaryCountFieldName);
verifyFieldName(categorizationFieldName);
verifyCategorizationFilters(categorizationFilters, categorizationFieldName);
verifyMlCategoryIsUsedWhenCategorizationFieldNameIsSet();
verifyCategorizationFilters();
checkFieldIsNotNegativeIfSpecified(RESULT_FINALIZATION_WINDOW.getPreferredName(), resultFinalizationWindow);
verifyMultipleBucketSpans();
@ -588,44 +585,58 @@ public class AnalysisConfig extends ToXContentToBytes implements Writeable {
}
}
private static void verifyDetectorAreDefined(List<Detector> detectors) {
/**
 * Rejects a configuration whose detector list is missing or empty.
 *
 * @throws IllegalArgumentException if no detectors have been supplied
 */
private void verifyDetectorAreDefined() {
    boolean noDetectors = detectors == null || detectors.isEmpty();
    if (noDetectors) {
        throw new IllegalArgumentException(Messages.getMessage(Messages.JOB_CONFIG_NO_DETECTORS));
    }
}
private static void verifyCategorizationFilters(List<String> filters, String categorizationFieldName) {
if (filters == null || filters.isEmpty()) {
/**
 * Cross-checks the categorization field name against the use of the ML
 * category field in the detectors' by/over/partition fields: each one
 * requires the other.
 *
 * @throws IllegalArgumentException if exactly one of the two is configured
 */
private void verifyMlCategoryIsUsedWhenCategorizationFieldNameIsSet() {
    boolean mlCategoryReferenced = detectors.stream()
            .anyMatch(detector -> detector.getByOverPartitionTerms().contains(ML_CATEGORY_FIELD));
    boolean hasCategorizationField = categorizationFieldName != null;

    // Either both are present or both are absent: nothing to complain about.
    if (mlCategoryReferenced == hasCategorizationField) {
        return;
    }

    if (mlCategoryReferenced) {
        throw new IllegalArgumentException(CATEGORIZATION_FIELD_NAME.getPreferredName()
                + " must be set for " + ML_CATEGORY_FIELD + " to be available");
    }
    throw new IllegalArgumentException(CATEGORIZATION_FIELD_NAME.getPreferredName()
            + " is set but " + ML_CATEGORY_FIELD + " is not used in any detector by/over/partition field");
}
private void verifyCategorizationFilters() {
if (categorizationFilters == null || categorizationFilters.isEmpty()) {
return;
}
verifyCategorizationFieldNameSetIfFiltersAreSet(categorizationFieldName);
verifyCategorizationFiltersAreDistinct(filters);
verifyCategorizationFiltersContainNoneEmpty(filters);
verifyCategorizationFiltersAreValidRegex(filters);
verifyCategorizationFieldNameSetIfFiltersAreSet();
verifyCategorizationFiltersAreDistinct();
verifyCategorizationFiltersContainNoneEmpty();
verifyCategorizationFiltersAreValidRegex();
}
private static void verifyCategorizationFieldNameSetIfFiltersAreSet(String categorizationFieldName) {
/**
 * Fails when no categorization field name has been configured.
 *
 * @throws IllegalArgumentException if the categorization field name is null
 */
private void verifyCategorizationFieldNameSetIfFiltersAreSet() {
    if (categorizationFieldName != null) {
        return;
    }
    String message = Messages.getMessage(
            Messages.JOB_CONFIG_CATEGORIZATION_FILTERS_REQUIRE_CATEGORIZATION_FIELD_NAME);
    throw new IllegalArgumentException(message);
}
private static void verifyCategorizationFiltersAreDistinct(List<String> filters) {
if (filters.stream().distinct().count() != filters.size()) {
/**
 * Fails when the same categorization filter appears more than once.
 *
 * @throws IllegalArgumentException if the filter list contains duplicates
 */
private void verifyCategorizationFiltersAreDistinct() {
    long distinctFilterCount = categorizationFilters.stream().distinct().count();
    boolean hasDuplicates = distinctFilterCount != categorizationFilters.size();
    if (hasDuplicates) {
        throw new IllegalArgumentException(Messages.getMessage(Messages.JOB_CONFIG_CATEGORIZATION_FILTERS_CONTAINS_DUPLICATES));
    }
}
private static void verifyCategorizationFiltersContainNoneEmpty(List<String> filters) {
if (filters.stream().anyMatch(f -> f.isEmpty())) {
/**
 * Fails when any categorization filter is the empty string.
 *
 * @throws IllegalArgumentException if an empty filter is present
 */
private void verifyCategorizationFiltersContainNoneEmpty() {
    for (String filter : categorizationFilters) {
        if (filter.isEmpty()) {
            throw new IllegalArgumentException(Messages.getMessage(Messages.JOB_CONFIG_CATEGORIZATION_FILTERS_CONTAINS_EMPTY));
        }
    }
}
private static void verifyCategorizationFiltersAreValidRegex(List<String> filters) {
for (String filter : filters) {
private void verifyCategorizationFiltersAreValidRegex() {
for (String filter : categorizationFilters) {
if (!isValidRegex(filter)) {
throw new IllegalArgumentException(
Messages.getMessage(Messages.JOB_CONFIG_CATEGORIZATION_FILTERS_CONTAINS_INVALID_REGEX, filter));

View File

@ -475,6 +475,23 @@ public class Detector extends ToXContentToBytes implements Writeable {
.flatMap(Set::stream).collect(Collectors.toSet());
}
/**
 * Collects the by, over and partition field names of this detector,
 * leaving out any that are not set.
 *
 * @return a new set holding the non-null by/over/partition field names
 */
public Set<String> getByOverPartitionTerms() {
    Set<String> collected = new HashSet<>();
    String[] candidates = {byFieldName, overFieldName, partitionFieldName};
    for (String candidate : candidates) {
        if (candidate == null) {
            continue;
        }
        collected.add(candidate);
    }
    return collected;
}
@Override
public boolean equals(Object other) {
if (this == other) {

View File

@ -20,6 +20,8 @@ import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import static org.hamcrest.Matchers.equalTo;
public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisConfig> {
@Override
@ -28,11 +30,12 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public static AnalysisConfig.Builder createRandomized() {
boolean isCategorization = randomBoolean();
List<Detector> detectors = new ArrayList<>();
int numDetectors = randomIntBetween(1, 10);
for (int i = 0; i < numDetectors; i++) {
Detector.Builder builder = new Detector.Builder("count", null);
builder.setPartitionFieldName("part");
builder.setPartitionFieldName(isCategorization ? "mlcategory" : "part");
detectors.add(builder.build());
}
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(detectors);
@ -45,7 +48,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
bucketSpan = TimeValue.timeValueSeconds(randomIntBetween(1, 1_000_000));
builder.setBucketSpan(bucketSpan);
}
if (randomBoolean()) {
if (isCategorization) {
builder.setCategorizationFieldName(randomAlphaOfLength(10));
builder.setCategorizationFilters(Arrays.asList(generateRandomStringArray(10, 10, false)));
}
@ -234,6 +237,60 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
assertTrue(ac.getMultipleBucketSpans().contains(TimeValue.timeValueSeconds(24000)));
}
/**
 * Building must fail when a detector uses mlcategory as its by field while
 * the categorization field name is null.
 */
public void testBuild_GivenMlCategoryUsedAsByFieldButNoCategorizationFieldName() {
    Detector.Builder detectorBuilder = new Detector.Builder();
    detectorBuilder.setFunction("count");
    detectorBuilder.setByFieldName("mlcategory");
    Detector byMlCategory = detectorBuilder.build();

    AnalysisConfig.Builder configBuilder = new AnalysisConfig.Builder(Arrays.asList(byMlCategory));
    configBuilder.setCategorizationFieldName(null);

    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> configBuilder.build());

    String expectedMessage = "categorization_field_name must be set for mlcategory to be available";
    assertThat(failure.getMessage(), equalTo(expectedMessage));
}
/**
 * Building must fail when a detector uses mlcategory as its over field while
 * the categorization field name is null.
 */
public void testBuild_GivenMlCategoryUsedAsOverFieldButNoCategorizationFieldName() {
    Detector.Builder detectorBuilder = new Detector.Builder();
    detectorBuilder.setFunction("count");
    detectorBuilder.setOverFieldName("mlcategory");
    Detector overMlCategory = detectorBuilder.build();

    AnalysisConfig.Builder configBuilder = new AnalysisConfig.Builder(Arrays.asList(overMlCategory));
    configBuilder.setCategorizationFieldName(null);

    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> configBuilder.build());

    String expectedMessage = "categorization_field_name must be set for mlcategory to be available";
    assertThat(failure.getMessage(), equalTo(expectedMessage));
}
/**
 * Building must fail when a detector uses mlcategory as its partition field
 * while the categorization field name is null.
 */
public void testBuild_GivenMlCategoryUsedAsPartitionFieldButNoCategorizationFieldName() {
    Detector.Builder detectorBuilder = new Detector.Builder();
    detectorBuilder.setFunction("count");
    detectorBuilder.setPartitionFieldName("mlcategory");
    Detector partitionedByMlCategory = detectorBuilder.build();

    AnalysisConfig.Builder configBuilder = new AnalysisConfig.Builder(Arrays.asList(partitionedByMlCategory));
    configBuilder.setCategorizationFieldName(null);

    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> configBuilder.build());

    String expectedMessage = "categorization_field_name must be set for mlcategory to be available";
    assertThat(failure.getMessage(), equalTo(expectedMessage));
}
/**
 * Building must fail when a categorization field name is set but the only
 * detector uses an over field other than mlcategory.
 */
public void testBuild_GivenCategorizationFieldNameButNoUseOfMlCategory() {
    Detector.Builder detectorBuilder = new Detector.Builder();
    detectorBuilder.setFunction("count");
    detectorBuilder.setOverFieldName("foo");
    Detector overFoo = detectorBuilder.build();

    AnalysisConfig.Builder configBuilder = new AnalysisConfig.Builder(Arrays.asList(overFoo));
    configBuilder.setCategorizationFieldName("msg");

    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> configBuilder.build());

    String expectedMessage = "categorization_field_name is set but mlcategory is "
            + "not used in any detector by/over/partition field";
    assertThat(failure.getMessage(), equalTo(expectedMessage));
}
/**
 * A detector that uses mlcategory as its by field, combined with a
 * categorization field name, is a valid configuration: build() must not throw.
 */
public void testBuild_GivenMlCategoryUsedAsByFieldAndCategorizationFieldName() {
    Detector.Builder detector = new Detector.Builder();
    detector.setFunction("count");
    // The test name promises the by-field case; the over field was being set
    // here instead, so the by-field positive path was never exercised.
    detector.setByFieldName("mlcategory");
    AnalysisConfig.Builder ac = new AnalysisConfig.Builder(Arrays.asList(detector.build()));
    ac.setCategorizationFieldName("msg");
    // Passing means no exception is thrown.
    ac.build();
}
public void testEquals_GivenSameReference() {
AnalysisConfig config = createFullyPopulatedConfig();
assertTrue(config.equals(config));
@ -283,11 +340,11 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public void testEquals_GivenCategorizationField() {
AnalysisConfig.Builder builder = createConfigBuilder();
AnalysisConfig.Builder builder = createValidCategorizationConfig();
builder.setCategorizationFieldName("foo");
AnalysisConfig config1 = builder.build();
builder = createConfigBuilder();
builder = createValidCategorizationConfig();
builder.setCategorizationFieldName("bar");
AnalysisConfig config2 = builder.build();
@ -370,11 +427,12 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public void testEquals_GivenDifferentCategorizationFilters() {
AnalysisConfig config1 = createFullyPopulatedConfig();
AnalysisConfig.Builder builder = createConfigBuilder();
builder.setCategorizationFilters(Arrays.asList("foo", "bar"));
builder.setCategorizationFieldName("cat");
AnalysisConfig config2 = builder.build();
AnalysisConfig.Builder configBuilder1 = createValidCategorizationConfig();
AnalysisConfig.Builder configBuilder2 = createValidCategorizationConfig();
configBuilder1.setCategorizationFilters(Arrays.asList("foo", "bar"));
configBuilder2.setCategorizationFilters(Arrays.asList("foo", "foobar"));
AnalysisConfig config1 = configBuilder1.build();
AnalysisConfig config2 = configBuilder2.build();
assertFalse(config1.equals(config2));
assertFalse(config2.equals(config1));
@ -398,8 +456,10 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
private static AnalysisConfig createFullyPopulatedConfig() {
Detector.Builder detector = new Detector.Builder("min", "count");
detector.setOverFieldName("mlcategory");
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(
Collections.singletonList(new Detector.Builder("min", "count").build()));
Collections.singletonList(detector.build()));
builder.setBucketSpan(TimeValue.timeValueHours(1));
builder.setBatchSpan(TimeValue.timeValueHours(24));
builder.setCategorizationFieldName("cat");
@ -508,8 +568,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public void testVerify_GivenValidConfigWithCategorizationFieldNameAndCategorizationFilters() {
AnalysisConfig.Builder analysisConfig = createValidConfig();
analysisConfig.setCategorizationFieldName("myCategory");
AnalysisConfig.Builder analysisConfig = createValidCategorizationConfig();
analysisConfig.setCategorizationFilters(Arrays.asList("foo", "bar"));
analysisConfig.build();
@ -668,8 +727,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public void testVerify_GivenDuplicateCategorizationFilters() {
AnalysisConfig.Builder config = createValidConfig();
config.setCategorizationFieldName("myCategory");
AnalysisConfig.Builder config = createValidCategorizationConfig();
config.setCategorizationFilters(Arrays.asList("foo", "bar", "foo"));
IllegalArgumentException e = ESTestCase.expectThrows(IllegalArgumentException.class, () -> config.build());
@ -678,8 +736,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public void testVerify_GivenEmptyCategorizationFilter() {
AnalysisConfig.Builder config = createValidConfig();
config.setCategorizationFieldName("myCategory");
AnalysisConfig.Builder config = createValidCategorizationConfig();
config.setCategorizationFilters(Arrays.asList("foo", ""));
IllegalArgumentException e = ESTestCase.expectThrows(IllegalArgumentException.class, () -> config.build());
@ -722,9 +779,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
}
public void testVerify_GivenCategorizationFiltersContainInvalidRegex() {
AnalysisConfig.Builder config = createValidConfig();
config.setCategorizationFieldName("myCategory");
AnalysisConfig.Builder config = createValidCategorizationConfig();
config.setCategorizationFilters(Arrays.asList("foo", "("));
IllegalArgumentException e = ESTestCase.expectThrows(IllegalArgumentException.class, () -> config.build());
@ -743,4 +798,16 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
analysisConfig.setPeriod(0L);
return analysisConfig;
}
/**
 * Creates a builder for a categorization job: a single count detector split
 * by mlcategory, with the categorization field name set to "msg".
 *
 * @return a builder that callers can customise further before building
 */
private static AnalysisConfig.Builder createValidCategorizationConfig() {
    Detector.Builder detectorBuilder = new Detector.Builder("count", null);
    detectorBuilder.setByFieldName("mlcategory");
    Detector countByMlCategory = detectorBuilder.build();

    AnalysisConfig.Builder configBuilder = new AnalysisConfig.Builder(Collections.singletonList(countByMlCategory));
    configBuilder.setBucketSpan(TimeValue.timeValueHours(1));
    configBuilder.setBatchSpan(TimeValue.timeValueHours(2));
    configBuilder.setLatency(TimeValue.ZERO);
    configBuilder.setPeriod(0L);
    configBuilder.setCategorizationFieldName("msg");
    return configBuilder;
}
}

View File

@ -90,7 +90,7 @@ public class JobUpdateTests extends AbstractSerializingTestCase<JobUpdate> {
public void testMergeWithJob() {
List<JobUpdate.DetectorUpdate> detectorUpdates = new ArrayList<>();
List<DetectionRule> detectionRules1 = Collections.singletonList(new DetectionRule("client", null, Connective.OR,
List<DetectionRule> detectionRules1 = Collections.singletonList(new DetectionRule("mlcategory", null, Connective.OR,
Collections.singletonList(
new RuleCondition(RuleConditionType.NUMERICAL_ACTUAL, null, null, new Condition(Operator.GT, "5"), null))));
detectorUpdates.add(new JobUpdate.DetectorUpdate(0, "description-1", detectionRules1));
@ -120,7 +120,7 @@ public class JobUpdateTests extends AbstractSerializingTestCase<JobUpdate> {
Job.Builder jobBuilder = new Job.Builder("foo");
Detector.Builder d1 = new Detector.Builder("info_content", "domain");
d1.setOverFieldName("client");
d1.setOverFieldName("mlcategory");
Detector.Builder d2 = new Detector.Builder("min", "field");
d2.setOverFieldName("host");
AnalysisConfig.Builder ac = new AnalysisConfig.Builder(Arrays.asList(d1.build(), d2.build()));

View File

@ -119,7 +119,7 @@ public class FieldConfigWriterTests extends ESTestCase {
public void testWrite_GivenConfigHasCategorizationField() throws IOException {
Detector.Builder d = new Detector.Builder("metric", "Integer_Value");
d.setByFieldName("ts_hash");
d.setByFieldName("mlcategory");
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d.build()));
builder.setCategorizationFieldName("foo");
@ -128,7 +128,7 @@ public class FieldConfigWriterTests extends ESTestCase {
createFieldConfigWriter().write();
verify(writer).write("detector.0.clause = metric(Integer_Value) by ts_hash categorizationfield=foo\n");
verify(writer).write("detector.0.clause = metric(Integer_Value) by mlcategory categorizationfield=foo\n");
verifyNoMoreInteractions(writer);
}
@ -153,7 +153,7 @@ public class FieldConfigWriterTests extends ESTestCase {
public void testWrite_GivenConfigHasCategorizationFieldAndFiltersAndInfluencer() throws IOException {
Detector.Builder d = new Detector.Builder("metric", "Integer_Value");
d.setByFieldName("ts_hash");
d.setByFieldName("mlcategory");
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d.build()));
builder.setInfluencers(Arrays.asList("sun"));
@ -166,7 +166,7 @@ public class FieldConfigWriterTests extends ESTestCase {
createFieldConfigWriter().write();
verify(writer).write(
"detector.0.clause = metric(Integer_Value) by ts_hash categorizationfield=myCategory\n" +
"detector.0.clause = metric(Integer_Value) by mlcategory categorizationfield=myCategory\n" +
"categorizationfilter.0 = foo\n" +
"categorizationfilter.1 = \" \"\n" +
"categorizationfilter.2 = \"abc,def\"\n" +

View File

@ -182,7 +182,8 @@
{
"description":"Pre update description",
"analysis_config" : {
"detectors" :[{"function":"mean","field_name":"responsetime","by_field_name":"airline"}, {"function":"count"}],
"detectors" :[{"function":"mean","field_name":"responsetime","by_field_name":"airline"},
{"function":"count","by_field_name":"mlcategory"}],
"categorization_field_name": "some_category",
"categorization_filters" : ["cat1.*", "cat2.*"]
},