[7.x] [ML] add num_matches and preferred_to_categories to category defintion objects (#54214) (#54639)

* [ML] add num_matches and preferred_to_categories to category defintion objects (#54214)

This adds two new fields to category definitions.

- `num_matches` indicating how many documents have been seen by this category
- `preferred_to_categories` indicating which other categories this particular category supersedes when messages are categorized.

These fields are only guaranteed to be up to date after a `_flush` or `_close`

native change: https://github.com/elastic/ml-cpp/pull/1062

* adjusting for backport
This commit is contained in:
Benjamin Trent 2020-04-02 09:09:19 -04:00 committed by GitHub
parent 54ecb009bb
commit eb31be0e71
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 237 additions and 12 deletions

View File

@ -27,6 +27,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;
@ -42,6 +43,8 @@ public class CategoryDefinition implements ToXContentObject {
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
public static final ParseField EXAMPLES = new ParseField("examples");
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
public static final ParseField NUM_MATCHES = new ParseField("num_matches");
public static final ParseField PREFERRED_TO_CATEGORIES = new ParseField("preferred_to_categories");
// Used for QueryPage
public static final ParseField RESULTS_FIELD = new ParseField("categories");
@ -57,6 +60,8 @@ public class CategoryDefinition implements ToXContentObject {
PARSER.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
PARSER.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
PARSER.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
PARSER.declareLong(CategoryDefinition::setNumMatches, NUM_MATCHES);
PARSER.declareLongArray(CategoryDefinition::setPreferredToCategories, PREFERRED_TO_CATEGORIES);
}
private final String jobId;
@ -66,6 +71,8 @@ public class CategoryDefinition implements ToXContentObject {
private long maxMatchingLength = 0L;
private final Set<String> examples = new TreeSet<>();
private String grokPattern;
private long numMatches = 0L;
private List<Long> preferredToCategories;
CategoryDefinition(String jobId) {
this.jobId = jobId;
@ -128,6 +135,22 @@ public class CategoryDefinition implements ToXContentObject {
this.grokPattern = grokPattern;
}
public long getNumMatches() {
return numMatches;
}
public void setNumMatches(long numMatches) {
this.numMatches = numMatches;
}
public List<Long> getPreferredToCategories() {
return preferredToCategories;
}
public void setPreferredToCategories(List<Long> preferredToCategories) {
this.preferredToCategories = Collections.unmodifiableList(preferredToCategories);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -140,6 +163,10 @@ public class CategoryDefinition implements ToXContentObject {
if (grokPattern != null) {
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
}
builder.field(NUM_MATCHES.getPreferredName(), numMatches);
if (preferredToCategories != null && (preferredToCategories.isEmpty() == false)) {
builder.field(PREFERRED_TO_CATEGORIES.getPreferredName(), preferredToCategories);
}
builder.endObject();
return builder;
}
@ -154,16 +181,18 @@ public class CategoryDefinition implements ToXContentObject {
}
CategoryDefinition that = (CategoryDefinition) other;
return Objects.equals(this.jobId, that.jobId)
&& Objects.equals(this.categoryId, that.categoryId)
&& Objects.equals(this.terms, that.terms)
&& Objects.equals(this.regex, that.regex)
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
&& Objects.equals(this.examples, that.examples)
&& Objects.equals(this.grokPattern, that.grokPattern);
&& Objects.equals(this.categoryId, that.categoryId)
&& Objects.equals(this.terms, that.terms)
&& Objects.equals(this.regex, that.regex)
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
&& Objects.equals(this.examples, that.examples)
&& Objects.equals(this.preferredToCategories, that.preferredToCategories)
&& Objects.equals(this.numMatches, that.numMatches)
&& Objects.equals(this.grokPattern, that.grokPattern);
}
@Override
public int hashCode() {
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, preferredToCategories, numMatches, grokPattern);
}
}

View File

@ -20,8 +20,11 @@ package org.elasticsearch.client.ml.job.results;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import org.elasticsearch.test.ESTestCase;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class CategoryDefinitionTests extends AbstractXContentTestCase<CategoryDefinition> {
@ -35,6 +38,14 @@ public class CategoryDefinitionTests extends AbstractXContentTestCase<CategoryDe
if (randomBoolean()) {
categoryDefinition.setGrokPattern(randomAlphaOfLength(50));
}
if (randomBoolean()) {
categoryDefinition.setNumMatches(randomNonNegativeLong());
}
if (randomBoolean()) {
categoryDefinition.setPreferredToCategories(Stream.generate(ESTestCase::randomNonNegativeLong)
.limit(10)
.collect(Collectors.toList()));
}
return categoryDefinition;
}

View File

@ -95,6 +95,16 @@ category.
(string) A space separated list of the common tokens that are matched in values
of the category.
`num_matches`::
(long) The number of messages that have been matched by this category. This is
only guaranteed to have the latest accurate count after a job `_flush` or `_close`
`preferred_to_categories`::
(list) A list of `category_id` entries that this current category encompasses.
Any new message that is processed by the categorizer will match against this
category and not any of the categories in this list. This is only guaranteed
to have the latest accurate list of categories after a job `_flush` or `_close`
[[ml-get-category-example]]
==== {api-examples-title}

View File

@ -17,6 +17,7 @@ import org.elasticsearch.xpack.core.ml.job.config.Job;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
@ -36,6 +37,8 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
public static final ParseField EXAMPLES = new ParseField("examples");
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
public static final ParseField NUM_MATCHES = new ParseField("num_matches");
public static final ParseField PREFERRED_TO_CATEGORIES = new ParseField("preferred_to_categories");
// Used for QueryPage
public static final ParseField RESULTS_FIELD = new ParseField("categories");
@ -54,7 +57,8 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
parser.declareLongArray(CategoryDefinition::setPreferredToCategories, PREFERRED_TO_CATEGORIES);
parser.declareLong(CategoryDefinition::setNumMatches, NUM_MATCHES);
return parser;
}
@ -65,6 +69,8 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
private long maxMatchingLength = 0L;
private final Set<String> examples;
private String grokPattern;
private long[] preferredToCategories = new long[0];
private long numMatches = 0L;
public CategoryDefinition(String jobId) {
this.jobId = jobId;
@ -81,6 +87,10 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
if (in.getVersion().onOrAfter(Version.V_6_4_0)) {
grokPattern = in.readOptionalString();
}
if (in.getVersion().onOrAfter(Version.V_7_8_0)) {
this.preferredToCategories = in.readVLongArray();
this.numMatches = in.readVLong();
}
}
@Override
@ -94,6 +104,10 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
if (out.getVersion().onOrAfter(Version.V_6_4_0)) {
out.writeOptionalString(grokPattern);
}
if (out.getVersion().onOrAfter(Version.V_7_8_0)) {
out.writeVLongArray(preferredToCategories);
out.writeVLong(numMatches);
}
}
public String getJobId() {
@ -157,6 +171,34 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
this.grokPattern = grokPattern;
}
public long[] getPreferredToCategories() {
return preferredToCategories;
}
public void setPreferredToCategories(long[] preferredToCategories) {
for (long category : preferredToCategories) {
if (category < 0) {
throw new IllegalArgumentException("[preferred_to_category] entries must be non-negative");
}
}
this.preferredToCategories = preferredToCategories;
}
private void setPreferredToCategories(List<Long> preferredToCategories) {
setPreferredToCategories(preferredToCategories.stream().mapToLong(Long::longValue).toArray());
}
public long getNumMatches() {
return numMatches;
}
public void setNumMatches(long numMatches) {
if (numMatches < 0) {
throw new IllegalArgumentException("[num_matches] must be non-negative");
}
this.numMatches = numMatches;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -169,6 +211,12 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
if (grokPattern != null) {
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
}
if (preferredToCategories.length > 0) {
builder.field(PREFERRED_TO_CATEGORIES.getPreferredName(), preferredToCategories);
}
if (numMatches > 0) {
builder.field(NUM_MATCHES.getPreferredName(), numMatches);
}
builder.endObject();
return builder;
}
@ -188,11 +236,21 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
&& Objects.equals(this.regex, that.regex)
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
&& Objects.equals(this.examples, that.examples)
&& Objects.equals(this.grokPattern, that.grokPattern);
&& Objects.equals(this.grokPattern, that.grokPattern)
&& Arrays.equals(this.preferredToCategories, that.preferredToCategories)
&& Objects.equals(this.numMatches, that.numMatches);
}
@Override
public int hashCode() {
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
return Objects.hash(jobId,
categoryId,
terms,
regex,
maxMatchingLength,
examples,
grokPattern,
Arrays.hashCode(preferredToCategories),
numMatches);
}
}

View File

@ -118,6 +118,8 @@ public final class ReservedFieldNames {
CategoryDefinition.REGEX.getPreferredName(),
CategoryDefinition.MAX_MATCHING_LENGTH.getPreferredName(),
CategoryDefinition.EXAMPLES.getPreferredName(),
CategoryDefinition.NUM_MATCHES.getPreferredName(),
CategoryDefinition.PREFERRED_TO_CATEGORIES.getPreferredName(),
DataCounts.PROCESSED_RECORD_COUNT.getPreferredName(),
DataCounts.PROCESSED_FIELD_COUNT.getPreferredName(),

View File

@ -385,6 +385,9 @@
"multi_bucket_impact" : {
"type" : "double"
},
"num_matches": {
"type" : "long"
},
"out_of_order_timestamp_count" : {
"type" : "long"
},
@ -406,6 +409,9 @@
"all_field_values"
]
},
"preferred_to_categories": {
"type": "long"
},
"probability" : {
"type" : "double"
},

View File

@ -28,6 +28,7 @@ import java.util.Locale;
import static org.elasticsearch.index.mapper.MapperService.SINGLE_MAPPING_NAME;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
/**
@ -206,6 +207,93 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
(MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA ? "Java" : "C++") + " took " + duration + "ms");
}
@AwaitsFix(bugUrl = "https://github.com/elastic/ml-cpp/pull/1062")
public void testNumMatchesAndCategoryPreference() throws Exception {
String index = "hadoop_logs";
client().admin().indices().prepareCreate(index)
.addMapping(SINGLE_MAPPING_NAME, "time", "type=date,format=epoch_millis",
"msg", "type=text")
.get();
nowMillis = System.currentTimeMillis();
BulkRequestBuilder bulkRequestBuilder = client().prepareBulk();
IndexRequest indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(8).millis(),
"msg", "2015-10-18 18:01:51,963 INFO [main] org.mortbay.log: jetty-6.1.26");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(7).millis(),
"msg",
"2015-10-18 18:01:52,728 INFO [main] org.mortbay.log: Started HttpServer2$SelectChannelConnectorWithSafeStartup@0.0.0.0:62267");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(6).millis(),
"msg", "2015-10-18 18:01:53,400 INFO [main] org.apache.hadoop.yarn.webapp.WebApps: Registered webapp guice modules");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(5).millis(),
"msg",
"2015-10-18 18:01:53,447 INFO [main] org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor: nodeBlacklistingEnabled:true");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(4).millis(),
"msg",
"2015-10-18 18:01:52,728 INFO [main] org.apache.hadoop.yarn.webapp.WebApps: Web app /mapreduce started at 62267");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(2).millis(),
"msg",
"2015-10-18 18:01:53,557 INFO [main] org.apache.hadoop.yarn.client.RMProxy: " +
"Connecting to ResourceManager at msra-sa-41/10.190.173.170:8030");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(1).millis(),
"msg",
"2015-10-18 18:01:53,713 INFO [main] org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: " +
"maxContainerCapability: <memory:8192, vCores:32>");
bulkRequestBuilder.add(indexRequest);
indexRequest = new IndexRequest(index);
indexRequest.source("time", nowMillis,
"msg",
"2015-10-18 18:01:53,713 INFO [main] org.apache.hadoop.yarn.client.api.impl.ContainerManagementProtocolProxy: " +
"yarn.client.max-cached-nodemanagers-proxies : 0");
bulkRequestBuilder.add(indexRequest);
BulkResponse bulkResponse = bulkRequestBuilder
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
.get();
assertThat(bulkResponse.hasFailures(), is(false));
Job.Builder job = newJobBuilder("categorization-with-preferred-categories", Collections.emptyList());
registerJob(job);
putJob(job);
openJob(job.getId());
String datafeedId = job.getId() + "-feed";
DatafeedConfig.Builder datafeedConfig = new DatafeedConfig.Builder(datafeedId, job.getId());
datafeedConfig.setIndices(Collections.singletonList(index));
DatafeedConfig datafeed = datafeedConfig.build();
registerDatafeed(datafeed);
putDatafeed(datafeed);
startDatafeed(datafeedId, 0, nowMillis + 1);
waitUntilJobIsClosed(job.getId());
List<CategoryDefinition> categories = getCategories(job.getId());
assertThat(categories, hasSize(7));
CategoryDefinition category1 = categories.get(0);
assertThat(category1.getNumMatches(), equalTo(2L));
long[] expectedPreferenceTo = new long[]{2L, 3L, 4L, 5L, 6L};
assertThat(category1.getPreferredToCategories().length, equalTo(5));
assertThat(category1.getPreferredToCategories(), equalTo(expectedPreferenceTo));
client().admin().indices().prepareDelete(index).get();
}
private static Job.Builder newJobBuilder(String id, List<String> categorizationFilters) {
Detector.Builder detector = new Detector.Builder();
detector.setFunction("count");

View File

@ -5,18 +5,21 @@
*/
package org.elasticsearch.xpack.ml.job.results;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.elasticsearch.test.AbstractSerializingTestCase;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.core.ml.AbstractBWCSerializationTestCase;
import org.elasticsearch.xpack.core.ml.job.results.CategoryDefinition;
import java.io.IOException;
import java.util.Arrays;
import java.util.stream.LongStream;
import static org.hamcrest.Matchers.containsString;
public class CategoryDefinitionTests extends AbstractSerializingTestCase<CategoryDefinition> {
public class CategoryDefinitionTests extends AbstractBWCSerializationTestCase<CategoryDefinition> {
public CategoryDefinition createTestInstance(String jobId) {
CategoryDefinition categoryDefinition = new CategoryDefinition(jobId);
@ -28,6 +31,12 @@ public class CategoryDefinitionTests extends AbstractSerializingTestCase<Categor
if (randomBoolean()) {
categoryDefinition.setGrokPattern(randomAlphaOfLength(50));
}
if (randomBoolean()) {
categoryDefinition.setNumMatches(randomNonNegativeLong());
}
if (randomBoolean()) {
categoryDefinition.setPreferredToCategories(LongStream.generate(ESTestCase::randomNonNegativeLong).limit(10).toArray());
}
return categoryDefinition;
}
@ -145,4 +154,16 @@ public class CategoryDefinitionTests extends AbstractSerializingTestCase<Categor
CategoryDefinition.LENIENT_PARSER.apply(parser, null);
}
}
@Override
protected CategoryDefinition mutateInstanceForVersion(CategoryDefinition instance, Version version) {
if (version.before(Version.V_6_4_0)) {
instance.setGrokPattern(null);
}
if (version.before(Version.V_7_8_0)) {
instance.setPreferredToCategories(new long[0]);
instance.setNumMatches(0L);
}
return instance;
}
}