[7.x] [ML] add num_matches and preferred_to_categories to category defintion objects (#54214) (#54639)
* [ML] add num_matches and preferred_to_categories to category defintion objects (#54214) This adds two new fields to category definitions. - `num_matches` indicating how many documents have been seen by this category - `preferred_to_categories` indicating which other categories this particular category supersedes when messages are categorized. These fields are only guaranteed to be up to date after a `_flush` or `_close` native change: https://github.com/elastic/ml-cpp/pull/1062 * adjusting for backport
This commit is contained in:
parent
54ecb009bb
commit
eb31be0e71
|
@ -27,6 +27,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
@ -42,6 +43,8 @@ public class CategoryDefinition implements ToXContentObject {
|
|||
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
|
||||
public static final ParseField EXAMPLES = new ParseField("examples");
|
||||
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
|
||||
public static final ParseField NUM_MATCHES = new ParseField("num_matches");
|
||||
public static final ParseField PREFERRED_TO_CATEGORIES = new ParseField("preferred_to_categories");
|
||||
|
||||
// Used for QueryPage
|
||||
public static final ParseField RESULTS_FIELD = new ParseField("categories");
|
||||
|
@ -57,6 +60,8 @@ public class CategoryDefinition implements ToXContentObject {
|
|||
PARSER.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
|
||||
PARSER.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
|
||||
PARSER.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
|
||||
PARSER.declareLong(CategoryDefinition::setNumMatches, NUM_MATCHES);
|
||||
PARSER.declareLongArray(CategoryDefinition::setPreferredToCategories, PREFERRED_TO_CATEGORIES);
|
||||
}
|
||||
|
||||
private final String jobId;
|
||||
|
@ -66,6 +71,8 @@ public class CategoryDefinition implements ToXContentObject {
|
|||
private long maxMatchingLength = 0L;
|
||||
private final Set<String> examples = new TreeSet<>();
|
||||
private String grokPattern;
|
||||
private long numMatches = 0L;
|
||||
private List<Long> preferredToCategories;
|
||||
|
||||
CategoryDefinition(String jobId) {
|
||||
this.jobId = jobId;
|
||||
|
@ -128,6 +135,22 @@ public class CategoryDefinition implements ToXContentObject {
|
|||
this.grokPattern = grokPattern;
|
||||
}
|
||||
|
||||
public long getNumMatches() {
|
||||
return numMatches;
|
||||
}
|
||||
|
||||
public void setNumMatches(long numMatches) {
|
||||
this.numMatches = numMatches;
|
||||
}
|
||||
|
||||
public List<Long> getPreferredToCategories() {
|
||||
return preferredToCategories;
|
||||
}
|
||||
|
||||
public void setPreferredToCategories(List<Long> preferredToCategories) {
|
||||
this.preferredToCategories = Collections.unmodifiableList(preferredToCategories);
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
builder.startObject();
|
||||
|
@ -140,6 +163,10 @@ public class CategoryDefinition implements ToXContentObject {
|
|||
if (grokPattern != null) {
|
||||
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
|
||||
}
|
||||
builder.field(NUM_MATCHES.getPreferredName(), numMatches);
|
||||
if (preferredToCategories != null && (preferredToCategories.isEmpty() == false)) {
|
||||
builder.field(PREFERRED_TO_CATEGORIES.getPreferredName(), preferredToCategories);
|
||||
}
|
||||
builder.endObject();
|
||||
return builder;
|
||||
}
|
||||
|
@ -159,11 +186,13 @@ public class CategoryDefinition implements ToXContentObject {
|
|||
&& Objects.equals(this.regex, that.regex)
|
||||
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
|
||||
&& Objects.equals(this.examples, that.examples)
|
||||
&& Objects.equals(this.preferredToCategories, that.preferredToCategories)
|
||||
&& Objects.equals(this.numMatches, that.numMatches)
|
||||
&& Objects.equals(this.grokPattern, that.grokPattern);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
|
||||
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, preferredToCategories, numMatches, grokPattern);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,11 @@ package org.elasticsearch.client.ml.job.results;
|
|||
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.test.AbstractXContentTestCase;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class CategoryDefinitionTests extends AbstractXContentTestCase<CategoryDefinition> {
|
||||
|
||||
|
@ -35,6 +38,14 @@ public class CategoryDefinitionTests extends AbstractXContentTestCase<CategoryDe
|
|||
if (randomBoolean()) {
|
||||
categoryDefinition.setGrokPattern(randomAlphaOfLength(50));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
categoryDefinition.setNumMatches(randomNonNegativeLong());
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
categoryDefinition.setPreferredToCategories(Stream.generate(ESTestCase::randomNonNegativeLong)
|
||||
.limit(10)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
return categoryDefinition;
|
||||
}
|
||||
|
||||
|
|
|
@ -95,6 +95,16 @@ category.
|
|||
(string) A space separated list of the common tokens that are matched in values
|
||||
of the category.
|
||||
|
||||
`num_matches`::
|
||||
(long) The number of messages that have been matched by this category. This is
|
||||
only guaranteed to have the latest accurate count after a job `_flush` or `_close`
|
||||
|
||||
`preferred_to_categories`::
|
||||
(list) A list of `category_id` entries that this current category encompasses.
|
||||
Any new message that is processed by the categorizer will match against this
|
||||
category and not any of the categories in this list. This is only guaranteed
|
||||
to have the latest accurate list of categories after a job `_flush` or `_close`
|
||||
|
||||
[[ml-get-category-example]]
|
||||
==== {api-examples-title}
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ import org.elasticsearch.xpack.core.ml.job.config.Job;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
@ -36,6 +37,8 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
|
||||
public static final ParseField EXAMPLES = new ParseField("examples");
|
||||
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
|
||||
public static final ParseField NUM_MATCHES = new ParseField("num_matches");
|
||||
public static final ParseField PREFERRED_TO_CATEGORIES = new ParseField("preferred_to_categories");
|
||||
|
||||
// Used for QueryPage
|
||||
public static final ParseField RESULTS_FIELD = new ParseField("categories");
|
||||
|
@ -54,7 +57,8 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
|
||||
parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
|
||||
parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
|
||||
|
||||
parser.declareLongArray(CategoryDefinition::setPreferredToCategories, PREFERRED_TO_CATEGORIES);
|
||||
parser.declareLong(CategoryDefinition::setNumMatches, NUM_MATCHES);
|
||||
return parser;
|
||||
}
|
||||
|
||||
|
@ -65,6 +69,8 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
private long maxMatchingLength = 0L;
|
||||
private final Set<String> examples;
|
||||
private String grokPattern;
|
||||
private long[] preferredToCategories = new long[0];
|
||||
private long numMatches = 0L;
|
||||
|
||||
public CategoryDefinition(String jobId) {
|
||||
this.jobId = jobId;
|
||||
|
@ -81,6 +87,10 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
if (in.getVersion().onOrAfter(Version.V_6_4_0)) {
|
||||
grokPattern = in.readOptionalString();
|
||||
}
|
||||
if (in.getVersion().onOrAfter(Version.V_7_8_0)) {
|
||||
this.preferredToCategories = in.readVLongArray();
|
||||
this.numMatches = in.readVLong();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -94,6 +104,10 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
if (out.getVersion().onOrAfter(Version.V_6_4_0)) {
|
||||
out.writeOptionalString(grokPattern);
|
||||
}
|
||||
if (out.getVersion().onOrAfter(Version.V_7_8_0)) {
|
||||
out.writeVLongArray(preferredToCategories);
|
||||
out.writeVLong(numMatches);
|
||||
}
|
||||
}
|
||||
|
||||
public String getJobId() {
|
||||
|
@ -157,6 +171,34 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
this.grokPattern = grokPattern;
|
||||
}
|
||||
|
||||
public long[] getPreferredToCategories() {
|
||||
return preferredToCategories;
|
||||
}
|
||||
|
||||
public void setPreferredToCategories(long[] preferredToCategories) {
|
||||
for (long category : preferredToCategories) {
|
||||
if (category < 0) {
|
||||
throw new IllegalArgumentException("[preferred_to_category] entries must be non-negative");
|
||||
}
|
||||
}
|
||||
this.preferredToCategories = preferredToCategories;
|
||||
}
|
||||
|
||||
private void setPreferredToCategories(List<Long> preferredToCategories) {
|
||||
setPreferredToCategories(preferredToCategories.stream().mapToLong(Long::longValue).toArray());
|
||||
}
|
||||
|
||||
public long getNumMatches() {
|
||||
return numMatches;
|
||||
}
|
||||
|
||||
public void setNumMatches(long numMatches) {
|
||||
if (numMatches < 0) {
|
||||
throw new IllegalArgumentException("[num_matches] must be non-negative");
|
||||
}
|
||||
this.numMatches = numMatches;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
builder.startObject();
|
||||
|
@ -169,6 +211,12 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
if (grokPattern != null) {
|
||||
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
|
||||
}
|
||||
if (preferredToCategories.length > 0) {
|
||||
builder.field(PREFERRED_TO_CATEGORIES.getPreferredName(), preferredToCategories);
|
||||
}
|
||||
if (numMatches > 0) {
|
||||
builder.field(NUM_MATCHES.getPreferredName(), numMatches);
|
||||
}
|
||||
builder.endObject();
|
||||
return builder;
|
||||
}
|
||||
|
@ -188,11 +236,21 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
|
|||
&& Objects.equals(this.regex, that.regex)
|
||||
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
|
||||
&& Objects.equals(this.examples, that.examples)
|
||||
&& Objects.equals(this.grokPattern, that.grokPattern);
|
||||
&& Objects.equals(this.grokPattern, that.grokPattern)
|
||||
&& Arrays.equals(this.preferredToCategories, that.preferredToCategories)
|
||||
&& Objects.equals(this.numMatches, that.numMatches);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
|
||||
return Objects.hash(jobId,
|
||||
categoryId,
|
||||
terms,
|
||||
regex,
|
||||
maxMatchingLength,
|
||||
examples,
|
||||
grokPattern,
|
||||
Arrays.hashCode(preferredToCategories),
|
||||
numMatches);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -118,6 +118,8 @@ public final class ReservedFieldNames {
|
|||
CategoryDefinition.REGEX.getPreferredName(),
|
||||
CategoryDefinition.MAX_MATCHING_LENGTH.getPreferredName(),
|
||||
CategoryDefinition.EXAMPLES.getPreferredName(),
|
||||
CategoryDefinition.NUM_MATCHES.getPreferredName(),
|
||||
CategoryDefinition.PREFERRED_TO_CATEGORIES.getPreferredName(),
|
||||
|
||||
DataCounts.PROCESSED_RECORD_COUNT.getPreferredName(),
|
||||
DataCounts.PROCESSED_FIELD_COUNT.getPreferredName(),
|
||||
|
|
|
@ -385,6 +385,9 @@
|
|||
"multi_bucket_impact" : {
|
||||
"type" : "double"
|
||||
},
|
||||
"num_matches": {
|
||||
"type" : "long"
|
||||
},
|
||||
"out_of_order_timestamp_count" : {
|
||||
"type" : "long"
|
||||
},
|
||||
|
@ -406,6 +409,9 @@
|
|||
"all_field_values"
|
||||
]
|
||||
},
|
||||
"preferred_to_categories": {
|
||||
"type": "long"
|
||||
},
|
||||
"probability" : {
|
||||
"type" : "double"
|
||||
},
|
||||
|
|
|
@ -28,6 +28,7 @@ import java.util.Locale;
|
|||
|
||||
import static org.elasticsearch.index.mapper.MapperService.SINGLE_MAPPING_NAME;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.hasSize;
|
||||
import static org.hamcrest.Matchers.is;
|
||||
|
||||
/**
|
||||
|
@ -206,6 +207,93 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
(MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA ? "Java" : "C++") + " took " + duration + "ms");
|
||||
}
|
||||
|
||||
@AwaitsFix(bugUrl = "https://github.com/elastic/ml-cpp/pull/1062")
|
||||
public void testNumMatchesAndCategoryPreference() throws Exception {
|
||||
String index = "hadoop_logs";
|
||||
client().admin().indices().prepareCreate(index)
|
||||
.addMapping(SINGLE_MAPPING_NAME, "time", "type=date,format=epoch_millis",
|
||||
"msg", "type=text")
|
||||
.get();
|
||||
|
||||
nowMillis = System.currentTimeMillis();
|
||||
|
||||
BulkRequestBuilder bulkRequestBuilder = client().prepareBulk();
|
||||
IndexRequest indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(8).millis(),
|
||||
"msg", "2015-10-18 18:01:51,963 INFO [main] org.mortbay.log: jetty-6.1.26");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(7).millis(),
|
||||
"msg",
|
||||
"2015-10-18 18:01:52,728 INFO [main] org.mortbay.log: Started HttpServer2$SelectChannelConnectorWithSafeStartup@0.0.0.0:62267");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(6).millis(),
|
||||
"msg", "2015-10-18 18:01:53,400 INFO [main] org.apache.hadoop.yarn.webapp.WebApps: Registered webapp guice modules");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(5).millis(),
|
||||
"msg",
|
||||
"2015-10-18 18:01:53,447 INFO [main] org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor: nodeBlacklistingEnabled:true");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(4).millis(),
|
||||
"msg",
|
||||
"2015-10-18 18:01:52,728 INFO [main] org.apache.hadoop.yarn.webapp.WebApps: Web app /mapreduce started at 62267");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(2).millis(),
|
||||
"msg",
|
||||
"2015-10-18 18:01:53,557 INFO [main] org.apache.hadoop.yarn.client.RMProxy: " +
|
||||
"Connecting to ResourceManager at msra-sa-41/10.190.173.170:8030");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis - TimeValue.timeValueHours(1).millis(),
|
||||
"msg",
|
||||
"2015-10-18 18:01:53,713 INFO [main] org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: " +
|
||||
"maxContainerCapability: <memory:8192, vCores:32>");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
|
||||
indexRequest = new IndexRequest(index);
|
||||
indexRequest.source("time", nowMillis,
|
||||
"msg",
|
||||
"2015-10-18 18:01:53,713 INFO [main] org.apache.hadoop.yarn.client.api.impl.ContainerManagementProtocolProxy: " +
|
||||
"yarn.client.max-cached-nodemanagers-proxies : 0");
|
||||
bulkRequestBuilder.add(indexRequest);
|
||||
|
||||
|
||||
BulkResponse bulkResponse = bulkRequestBuilder
|
||||
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
|
||||
.get();
|
||||
assertThat(bulkResponse.hasFailures(), is(false));
|
||||
|
||||
Job.Builder job = newJobBuilder("categorization-with-preferred-categories", Collections.emptyList());
|
||||
registerJob(job);
|
||||
putJob(job);
|
||||
openJob(job.getId());
|
||||
|
||||
String datafeedId = job.getId() + "-feed";
|
||||
DatafeedConfig.Builder datafeedConfig = new DatafeedConfig.Builder(datafeedId, job.getId());
|
||||
datafeedConfig.setIndices(Collections.singletonList(index));
|
||||
DatafeedConfig datafeed = datafeedConfig.build();
|
||||
registerDatafeed(datafeed);
|
||||
putDatafeed(datafeed);
|
||||
startDatafeed(datafeedId, 0, nowMillis + 1);
|
||||
waitUntilJobIsClosed(job.getId());
|
||||
|
||||
List<CategoryDefinition> categories = getCategories(job.getId());
|
||||
assertThat(categories, hasSize(7));
|
||||
|
||||
CategoryDefinition category1 = categories.get(0);
|
||||
assertThat(category1.getNumMatches(), equalTo(2L));
|
||||
long[] expectedPreferenceTo = new long[]{2L, 3L, 4L, 5L, 6L};
|
||||
assertThat(category1.getPreferredToCategories().length, equalTo(5));
|
||||
assertThat(category1.getPreferredToCategories(), equalTo(expectedPreferenceTo));
|
||||
client().admin().indices().prepareDelete(index).get();
|
||||
}
|
||||
|
||||
|
||||
private static Job.Builder newJobBuilder(String id, List<String> categorizationFilters) {
|
||||
Detector.Builder detector = new Detector.Builder();
|
||||
detector.setFunction("count");
|
||||
|
|
|
@ -5,18 +5,21 @@
|
|||
*/
|
||||
package org.elasticsearch.xpack.ml.job.results;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.common.xcontent.json.JsonXContent;
|
||||
import org.elasticsearch.test.AbstractSerializingTestCase;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.core.ml.AbstractBWCSerializationTestCase;
|
||||
import org.elasticsearch.xpack.core.ml.job.results.CategoryDefinition;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
import static org.hamcrest.Matchers.containsString;
|
||||
|
||||
public class CategoryDefinitionTests extends AbstractSerializingTestCase<CategoryDefinition> {
|
||||
public class CategoryDefinitionTests extends AbstractBWCSerializationTestCase<CategoryDefinition> {
|
||||
|
||||
public CategoryDefinition createTestInstance(String jobId) {
|
||||
CategoryDefinition categoryDefinition = new CategoryDefinition(jobId);
|
||||
|
@ -28,6 +31,12 @@ public class CategoryDefinitionTests extends AbstractSerializingTestCase<Categor
|
|||
if (randomBoolean()) {
|
||||
categoryDefinition.setGrokPattern(randomAlphaOfLength(50));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
categoryDefinition.setNumMatches(randomNonNegativeLong());
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
categoryDefinition.setPreferredToCategories(LongStream.generate(ESTestCase::randomNonNegativeLong).limit(10).toArray());
|
||||
}
|
||||
return categoryDefinition;
|
||||
}
|
||||
|
||||
|
@ -145,4 +154,16 @@ public class CategoryDefinitionTests extends AbstractSerializingTestCase<Categor
|
|||
CategoryDefinition.LENIENT_PARSER.apply(parser, null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CategoryDefinition mutateInstanceForVersion(CategoryDefinition instance, Version version) {
|
||||
if (version.before(Version.V_6_4_0)) {
|
||||
instance.setGrokPattern(null);
|
||||
}
|
||||
if (version.before(Version.V_7_8_0)) {
|
||||
instance.setPreferredToCategories(new long[0]);
|
||||
instance.setNumMatches(0L);
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue