mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-17 10:25:15 +00:00
[ML] Add a file structure determination endpoint (#33471)
This endpoint accepts an arbitrary file in the request body and attempts to determine the structure. If successful it also proposes mappings that could be used when indexing the file's contents, and calculates simple statistics for each of the fields that are useful in the data preparation step prior to configuring machine learning jobs.
This commit is contained in:
parent
4d233107f8
commit
e42cc5cd8c
@ -56,6 +56,7 @@ import org.elasticsearch.xpack.core.ml.action.DeleteForecastAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.DeleteJobAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.DeleteModelSnapshotAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.FlushJobAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.ForecastJobAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.GetBucketsAction;
|
||||
@ -265,6 +266,7 @@ public class XPackClientPlugin extends Plugin implements ActionPlugin, NetworkPl
|
||||
GetCalendarEventsAction.INSTANCE,
|
||||
PostCalendarEventsAction.INSTANCE,
|
||||
PersistJobAction.INSTANCE,
|
||||
FindFileStructureAction.INSTANCE,
|
||||
// security
|
||||
ClearRealmCacheAction.INSTANCE,
|
||||
ClearRolesCacheAction.INSTANCE,
|
||||
|
@ -0,0 +1,183 @@
|
||||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.core.ml.action;
|
||||
|
||||
import org.elasticsearch.action.Action;
|
||||
import org.elasticsearch.action.ActionRequest;
|
||||
import org.elasticsearch.action.ActionRequestBuilder;
|
||||
import org.elasticsearch.action.ActionRequestValidationException;
|
||||
import org.elasticsearch.action.ActionResponse;
|
||||
import org.elasticsearch.client.ElasticsearchClient;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.xcontent.StatusToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.rest.RestStatus;
|
||||
import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.elasticsearch.action.ValidateActions.addValidationError;
|
||||
|
||||
public class FindFileStructureAction extends Action<FindFileStructureAction.Response> {
|
||||
|
||||
public static final FindFileStructureAction INSTANCE = new FindFileStructureAction();
|
||||
public static final String NAME = "cluster:monitor/xpack/ml/findfilestructure";
|
||||
|
||||
private FindFileStructureAction() {
|
||||
super(NAME);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Response newResponse() {
|
||||
return new Response();
|
||||
}
|
||||
|
||||
static class RequestBuilder extends ActionRequestBuilder<Request, Response> {
|
||||
|
||||
RequestBuilder(ElasticsearchClient client, FindFileStructureAction action) {
|
||||
super(client, action, new Request());
|
||||
}
|
||||
}
|
||||
|
||||
public static class Response extends ActionResponse implements StatusToXContentObject, Writeable {
|
||||
|
||||
private FileStructure fileStructure;
|
||||
|
||||
public Response(FileStructure fileStructure) {
|
||||
this.fileStructure = fileStructure;
|
||||
}
|
||||
|
||||
Response() {
|
||||
}
|
||||
|
||||
public FileStructure getFileStructure() {
|
||||
return fileStructure;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFrom(StreamInput in) throws IOException {
|
||||
super.readFrom(in);
|
||||
fileStructure = new FileStructure(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
super.writeTo(out);
|
||||
fileStructure.writeTo(out);
|
||||
}
|
||||
|
||||
@Override
|
||||
public RestStatus status() {
|
||||
return RestStatus.OK;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
fileStructure.toXContent(builder, params);
|
||||
return builder;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(fileStructure);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other == null || getClass() != other.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
FindFileStructureAction.Response that = (FindFileStructureAction.Response) other;
|
||||
return Objects.equals(fileStructure, that.fileStructure);
|
||||
}
|
||||
}
|
||||
|
||||
public static class Request extends ActionRequest {
|
||||
|
||||
public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
|
||||
|
||||
private Integer linesToSample;
|
||||
private BytesReference sample;
|
||||
|
||||
public Request() {
|
||||
}
|
||||
|
||||
public Integer getLinesToSample() {
|
||||
return linesToSample;
|
||||
}
|
||||
|
||||
public void setLinesToSample(Integer linesToSample) {
|
||||
this.linesToSample = linesToSample;
|
||||
}
|
||||
|
||||
public BytesReference getSample() {
|
||||
return sample;
|
||||
}
|
||||
|
||||
public void setSample(BytesReference sample) {
|
||||
this.sample = sample;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActionRequestValidationException validate() {
|
||||
ActionRequestValidationException validationException = null;
|
||||
if (linesToSample != null && linesToSample <= 0) {
|
||||
validationException =
|
||||
addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException);
|
||||
}
|
||||
if (sample == null || sample.length() == 0) {
|
||||
validationException = addValidationError("sample must be specified", validationException);
|
||||
}
|
||||
return validationException;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFrom(StreamInput in) throws IOException {
|
||||
super.readFrom(in);
|
||||
linesToSample = in.readOptionalVInt();
|
||||
sample = in.readBytesReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
super.writeTo(out);
|
||||
out.writeOptionalVInt(linesToSample);
|
||||
out.writeBytesReference(sample);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(linesToSample, sample);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other == null || getClass() != other.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Request that = (Request) other;
|
||||
return Objects.equals(this.linesToSample, that.linesToSample) &&
|
||||
Objects.equals(this.sample, that.sample);
|
||||
}
|
||||
}
|
||||
}
|
@ -6,6 +6,9 @@
|
||||
package org.elasticsearch.xpack.core.ml.filestructurefinder;
|
||||
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
|
||||
import org.elasticsearch.common.xcontent.ToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
@ -16,7 +19,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
public class FieldStats implements ToXContentObject {
|
||||
public class FieldStats implements ToXContentObject, Writeable {
|
||||
|
||||
static final ParseField COUNT = new ParseField("count");
|
||||
static final ParseField CARDINALITY = new ParseField("cardinality");
|
||||
@ -64,6 +67,27 @@ public class FieldStats implements ToXContentObject {
|
||||
this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits);
|
||||
}
|
||||
|
||||
public FieldStats(StreamInput in) throws IOException {
|
||||
count = in.readVLong();
|
||||
cardinality = in.readVInt();
|
||||
minValue = in.readOptionalDouble();
|
||||
maxValue = in.readOptionalDouble();
|
||||
meanValue = in.readOptionalDouble();
|
||||
medianValue = in.readOptionalDouble();
|
||||
topHits = in.readList(StreamInput::readMap);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeVLong(count);
|
||||
out.writeVInt(cardinality);
|
||||
out.writeOptionalDouble(minValue);
|
||||
out.writeOptionalDouble(maxValue);
|
||||
out.writeOptionalDouble(meanValue);
|
||||
out.writeOptionalDouble(medianValue);
|
||||
out.writeCollection(topHits, StreamOutput::writeMap);
|
||||
}
|
||||
|
||||
public long getCount() {
|
||||
return count;
|
||||
}
|
||||
|
@ -6,6 +6,9 @@
|
||||
package org.elasticsearch.xpack.core.ml.filestructurefinder;
|
||||
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.xcontent.ObjectParser;
|
||||
import org.elasticsearch.common.xcontent.ToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
@ -24,7 +27,7 @@ import java.util.TreeMap;
|
||||
/**
|
||||
* Stores the file format determined by Machine Learning.
|
||||
*/
|
||||
public class FileStructure implements ToXContentObject {
|
||||
public class FileStructure implements ToXContentObject, Writeable {
|
||||
|
||||
public enum Format {
|
||||
|
||||
@ -79,6 +82,8 @@ public class FileStructure implements ToXContentObject {
|
||||
}
|
||||
}
|
||||
|
||||
public static final String EXPLAIN = "explain";
|
||||
|
||||
static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
|
||||
static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
|
||||
static final ParseField SAMPLE_START = new ParseField("sample_start");
|
||||
@ -176,6 +181,66 @@ public class FileStructure implements ToXContentObject {
|
||||
this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
|
||||
}
|
||||
|
||||
public FileStructure(StreamInput in) throws IOException {
|
||||
numLinesAnalyzed = in.readVInt();
|
||||
numMessagesAnalyzed = in.readVInt();
|
||||
sampleStart = in.readString();
|
||||
charset = in.readString();
|
||||
hasByteOrderMarker = in.readOptionalBoolean();
|
||||
format = in.readEnum(Format.class);
|
||||
multilineStartPattern = in.readOptionalString();
|
||||
excludeLinesPattern = in.readOptionalString();
|
||||
inputFields = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
|
||||
hasHeaderRow = in.readOptionalBoolean();
|
||||
delimiter = in.readBoolean() ? (char) in.readVInt() : null;
|
||||
shouldTrimFields = in.readOptionalBoolean();
|
||||
grokPattern = in.readOptionalString();
|
||||
timestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
|
||||
timestampField = in.readOptionalString();
|
||||
needClientTimezone = in.readBoolean();
|
||||
mappings = Collections.unmodifiableSortedMap(new TreeMap<>(in.readMap()));
|
||||
fieldStats = Collections.unmodifiableSortedMap(new TreeMap<>(in.readMap(StreamInput::readString, FieldStats::new)));
|
||||
explanation = Collections.unmodifiableList(in.readList(StreamInput::readString));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeVInt(numLinesAnalyzed);
|
||||
out.writeVInt(numMessagesAnalyzed);
|
||||
out.writeString(sampleStart);
|
||||
out.writeString(charset);
|
||||
out.writeOptionalBoolean(hasByteOrderMarker);
|
||||
out.writeEnum(format);
|
||||
out.writeOptionalString(multilineStartPattern);
|
||||
out.writeOptionalString(excludeLinesPattern);
|
||||
if (inputFields == null) {
|
||||
out.writeBoolean(false);
|
||||
} else {
|
||||
out.writeBoolean(true);
|
||||
out.writeCollection(inputFields, StreamOutput::writeString);
|
||||
}
|
||||
out.writeOptionalBoolean(hasHeaderRow);
|
||||
if (delimiter == null) {
|
||||
out.writeBoolean(false);
|
||||
} else {
|
||||
out.writeBoolean(true);
|
||||
out.writeVInt(delimiter);
|
||||
}
|
||||
out.writeOptionalBoolean(shouldTrimFields);
|
||||
out.writeOptionalString(grokPattern);
|
||||
if (timestampFormats == null) {
|
||||
out.writeBoolean(false);
|
||||
} else {
|
||||
out.writeBoolean(true);
|
||||
out.writeCollection(timestampFormats, StreamOutput::writeString);
|
||||
}
|
||||
out.writeOptionalString(timestampField);
|
||||
out.writeBoolean(needClientTimezone);
|
||||
out.writeMap(mappings);
|
||||
out.writeMap(fieldStats, StreamOutput::writeString, (out1, value) -> value.writeTo(out1));
|
||||
out.writeCollection(explanation, StreamOutput::writeString);
|
||||
}
|
||||
|
||||
public int getNumLinesAnalyzed() {
|
||||
return numLinesAnalyzed;
|
||||
}
|
||||
@ -300,7 +365,9 @@ public class FileStructure implements ToXContentObject {
|
||||
}
|
||||
builder.endObject();
|
||||
}
|
||||
builder.field(EXPLANATION.getPreferredName(), explanation);
|
||||
if (params.paramAsBoolean(EXPLAIN, false)) {
|
||||
builder.field(EXPLANATION.getPreferredName(), explanation);
|
||||
}
|
||||
builder.endObject();
|
||||
|
||||
return builder;
|
||||
|
@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.core.ml.action;
|
||||
|
||||
import org.elasticsearch.action.ActionRequestValidationException;
|
||||
import org.elasticsearch.common.bytes.BytesArray;
|
||||
import org.elasticsearch.test.AbstractStreamableTestCase;
|
||||
|
||||
import static org.hamcrest.Matchers.containsString;
|
||||
import static org.hamcrest.Matchers.startsWith;
|
||||
|
||||
public class FindFileStructureActionRequestTests extends AbstractStreamableTestCase<FindFileStructureAction.Request> {
|
||||
|
||||
@Override
|
||||
protected FindFileStructureAction.Request createTestInstance() {
|
||||
|
||||
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
|
||||
|
||||
if (randomBoolean()) {
|
||||
request.setLinesToSample(randomIntBetween(10, 2000));
|
||||
}
|
||||
request.setSample(new BytesArray(randomByteArrayOfLength(randomIntBetween(1000, 20000))));
|
||||
|
||||
return request;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected FindFileStructureAction.Request createBlankInstance() {
|
||||
return new FindFileStructureAction.Request();
|
||||
}
|
||||
|
||||
public void testValidateLinesToSample() {
|
||||
|
||||
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
|
||||
request.setLinesToSample(randomFrom(-1, 0));
|
||||
request.setSample(new BytesArray("foo\n"));
|
||||
|
||||
ActionRequestValidationException e = request.validate();
|
||||
assertNotNull(e);
|
||||
assertThat(e.getMessage(), startsWith("Validation Failed: "));
|
||||
assertThat(e.getMessage(), containsString(" lines_to_sample must be positive if specified"));
|
||||
}
|
||||
|
||||
public void testValidateSample() {
|
||||
|
||||
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
|
||||
if (randomBoolean()) {
|
||||
request.setSample(BytesArray.EMPTY);
|
||||
}
|
||||
|
||||
ActionRequestValidationException e = request.validate();
|
||||
assertNotNull(e);
|
||||
assertThat(e.getMessage(), startsWith("Validation Failed: "));
|
||||
assertThat(e.getMessage(), containsString(" sample must be specified"));
|
||||
}
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.core.ml.action;
|
||||
|
||||
import org.elasticsearch.test.AbstractStreamableTestCase;
|
||||
import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructureTests;
|
||||
|
||||
public class FindFileStructureActionResponseTests extends AbstractStreamableTestCase<FindFileStructureAction.Response> {
|
||||
|
||||
@Override
|
||||
protected FindFileStructureAction.Response createTestInstance() {
|
||||
return new FindFileStructureAction.Response(FileStructureTests.createTestFileStructure());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected FindFileStructureAction.Response createBlankInstance() {
|
||||
return new FindFileStructureAction.Response();
|
||||
}
|
||||
}
|
@ -5,16 +5,18 @@
|
||||
*/
|
||||
package org.elasticsearch.xpack.core.ml.filestructurefinder;
|
||||
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.test.AbstractXContentTestCase;
|
||||
import org.elasticsearch.test.AbstractSerializingTestCase;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class FieldStatsTests extends AbstractXContentTestCase<FieldStats> {
|
||||
public class FieldStatsTests extends AbstractSerializingTestCase<FieldStats> {
|
||||
|
||||
@Override
|
||||
protected FieldStats createTestInstance() {
|
||||
return createTestFieldStats();
|
||||
}
|
||||
@ -51,11 +53,13 @@ public class FieldStatsTests extends AbstractXContentTestCase<FieldStats> {
|
||||
return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Writeable.Reader<FieldStats> instanceReader() {
|
||||
return FieldStats::new;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected FieldStats doParseInstance(XContentParser parser) {
|
||||
return FieldStats.PARSER.apply(parser, null);
|
||||
}
|
||||
|
||||
protected boolean supportsUnknownFields() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -5,8 +5,10 @@
|
||||
*/
|
||||
package org.elasticsearch.xpack.core.ml.filestructurefinder;
|
||||
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.xcontent.ToXContent;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.test.AbstractXContentTestCase;
|
||||
import org.elasticsearch.test.AbstractSerializingTestCase;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Arrays;
|
||||
@ -16,9 +18,14 @@ import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
public class FileStructureTests extends AbstractXContentTestCase<FileStructure> {
|
||||
public class FileStructureTests extends AbstractSerializingTestCase<FileStructure> {
|
||||
|
||||
@Override
|
||||
protected FileStructure createTestInstance() {
|
||||
return createTestFileStructure();
|
||||
}
|
||||
|
||||
public static FileStructure createTestFileStructure() {
|
||||
|
||||
FileStructure.Format format = randomFrom(EnumSet.allOf(FileStructure.Format.class));
|
||||
|
||||
@ -66,24 +73,31 @@ public class FileStructureTests extends AbstractXContentTestCase<FileStructure>
|
||||
}
|
||||
builder.setMappings(mappings);
|
||||
|
||||
//if (randomBoolean()) {
|
||||
if (randomBoolean()) {
|
||||
Map<String, FieldStats> fieldStats = new TreeMap<>();
|
||||
for (String field : generateRandomStringArray(5, 20, false, false)) {
|
||||
fieldStats.put(field, FieldStatsTests.createTestFieldStats());
|
||||
}
|
||||
builder.setFieldStats(fieldStats);
|
||||
//}
|
||||
}
|
||||
|
||||
builder.setExplanation(Arrays.asList(generateRandomStringArray(10, 150, false, false)));
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Writeable.Reader<FileStructure> instanceReader() {
|
||||
return FileStructure::new;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected FileStructure doParseInstance(XContentParser parser) {
|
||||
return FileStructure.PARSER.apply(parser, null).build();
|
||||
}
|
||||
|
||||
protected boolean supportsUnknownFields() {
|
||||
return false;
|
||||
@Override
|
||||
protected ToXContent.Params getToXContentParams() {
|
||||
return new ToXContent.MapParams(Collections.singletonMap(FileStructure.EXPLAIN, "true"));
|
||||
}
|
||||
}
|
||||
|
@ -31,10 +31,13 @@ public class MlWithSecurityUserRoleIT extends MlWithSecurityIT {
|
||||
super.test();
|
||||
|
||||
// We should have got here if and only if the only ML endpoints in the test were GETs
|
||||
// or the find_file_structure API, which is also available to the machine_learning_user
|
||||
// role
|
||||
for (ExecutableSection section : testCandidate.getTestSection().getExecutableSections()) {
|
||||
if (section instanceof DoSection) {
|
||||
if (((DoSection) section).getApiCallSection().getApi().startsWith("xpack.ml.") &&
|
||||
((DoSection) section).getApiCallSection().getApi().startsWith("xpack.ml.get_") == false) {
|
||||
((DoSection) section).getApiCallSection().getApi().startsWith("xpack.ml.get_") == false &&
|
||||
((DoSection) section).getApiCallSection().getApi().equals("xpack.ml.find_file_structure") == false) {
|
||||
fail("should have failed because of missing role");
|
||||
}
|
||||
}
|
||||
|
@ -66,6 +66,7 @@ import org.elasticsearch.xpack.core.ml.action.DeleteForecastAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.DeleteJobAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.DeleteModelSnapshotAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.FlushJobAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.ForecastJobAction;
|
||||
import org.elasticsearch.xpack.core.ml.action.GetBucketsAction;
|
||||
@ -119,6 +120,7 @@ import org.elasticsearch.xpack.ml.action.TransportDeleteForecastAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportDeleteJobAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportDeleteModelSnapshotAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportFinalizeJobExecutionAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportFindFileStructureAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportFlushJobAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportForecastJobAction;
|
||||
import org.elasticsearch.xpack.ml.action.TransportGetBucketsAction;
|
||||
@ -180,6 +182,7 @@ import org.elasticsearch.xpack.ml.job.process.normalizer.NormalizerFactory;
|
||||
import org.elasticsearch.xpack.ml.job.process.normalizer.NormalizerProcessFactory;
|
||||
import org.elasticsearch.xpack.ml.notifications.Auditor;
|
||||
import org.elasticsearch.xpack.ml.rest.RestDeleteExpiredDataAction;
|
||||
import org.elasticsearch.xpack.ml.rest.RestFindFileStructureAction;
|
||||
import org.elasticsearch.xpack.ml.rest.RestMlInfoAction;
|
||||
import org.elasticsearch.xpack.ml.rest.calendar.RestDeleteCalendarAction;
|
||||
import org.elasticsearch.xpack.ml.rest.calendar.RestDeleteCalendarEventAction;
|
||||
@ -500,7 +503,8 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
|
||||
new RestDeleteCalendarJobAction(settings, restController),
|
||||
new RestPutCalendarJobAction(settings, restController),
|
||||
new RestGetCalendarEventsAction(settings, restController),
|
||||
new RestPostCalendarEventAction(settings, restController)
|
||||
new RestPostCalendarEventAction(settings, restController),
|
||||
new RestFindFileStructureAction(settings, restController)
|
||||
);
|
||||
}
|
||||
|
||||
@ -557,7 +561,8 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
|
||||
new ActionHandler<>(UpdateCalendarJobAction.INSTANCE, TransportUpdateCalendarJobAction.class),
|
||||
new ActionHandler<>(GetCalendarEventsAction.INSTANCE, TransportGetCalendarEventsAction.class),
|
||||
new ActionHandler<>(PostCalendarEventsAction.INSTANCE, TransportPostCalendarEventsAction.class),
|
||||
new ActionHandler<>(PersistJobAction.INSTANCE, TransportPersistJobAction.class)
|
||||
new ActionHandler<>(PersistJobAction.INSTANCE, TransportPersistJobAction.class),
|
||||
new ActionHandler<>(FindFileStructureAction.INSTANCE, TransportFindFileStructureAction.class)
|
||||
);
|
||||
}
|
||||
@Override
|
||||
|
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.action;
|
||||
|
||||
import org.elasticsearch.action.ActionListener;
|
||||
import org.elasticsearch.action.support.ActionFilters;
|
||||
import org.elasticsearch.action.support.HandledTransportAction;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.tasks.Task;
|
||||
import org.elasticsearch.threadpool.ThreadPool;
|
||||
import org.elasticsearch.transport.TransportService;
|
||||
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
|
||||
import org.elasticsearch.xpack.ml.MachineLearning;
|
||||
import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinder;
|
||||
import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager;
|
||||
|
||||
public class TransportFindFileStructureAction
|
||||
extends HandledTransportAction<FindFileStructureAction.Request, FindFileStructureAction.Response> {
|
||||
|
||||
private final ThreadPool threadPool;
|
||||
|
||||
@Inject
|
||||
public TransportFindFileStructureAction(Settings settings, TransportService transportService, ActionFilters actionFilters,
|
||||
ThreadPool threadPool) {
|
||||
super(settings, FindFileStructureAction.NAME, transportService, actionFilters, FindFileStructureAction.Request::new);
|
||||
this.threadPool = threadPool;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doExecute(Task task, FindFileStructureAction.Request request,
|
||||
ActionListener<FindFileStructureAction.Response> listener) {
|
||||
|
||||
// As determining the file structure might take a while, we run
|
||||
// in a different thread to avoid blocking the network thread.
|
||||
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(() -> {
|
||||
try {
|
||||
listener.onResponse(buildFileStructureResponse(request));
|
||||
} catch (Exception e) {
|
||||
listener.onFailure(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private FindFileStructureAction.Response buildFileStructureResponse(FindFileStructureAction.Request request) throws Exception {
|
||||
|
||||
FileStructureFinderManager structureFinderManager = new FileStructureFinderManager();
|
||||
|
||||
FileStructureFinder fileStructureFinder =
|
||||
structureFinderManager.findFileStructure(request.getLinesToSample(), request.getSample().streamInput());
|
||||
|
||||
return new FindFileStructureAction.Response(fileStructureFinder.getStructure());
|
||||
}
|
||||
}
|
@ -35,6 +35,7 @@ import java.util.Set;
|
||||
public final class FileStructureFinderManager {
|
||||
|
||||
public static final int MIN_SAMPLE_LINE_COUNT = 2;
|
||||
public static final int DEFAULT_IDEAL_SAMPLE_LINE_COUNT = 1000;
|
||||
|
||||
static final Set<String> FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
|
||||
"866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252",
|
||||
@ -82,16 +83,18 @@ public final class FileStructureFinderManager {
|
||||
* Given a stream of data from some file, determine its structure.
|
||||
* @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
|
||||
* If the stream has fewer lines then an attempt will still be made, providing at
|
||||
* least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.
|
||||
* least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read. If <code>null</code>
|
||||
* the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used.
|
||||
* @param fromFile A stream from which the sample will be read.
|
||||
* @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
|
||||
* @throws Exception A variety of problems could occur at various stages of the structure finding process.
|
||||
*/
|
||||
public FileStructureFinder findLogStructure(int idealSampleLineCount, InputStream fromFile) throws Exception {
|
||||
return findLogStructure(new ArrayList<>(), idealSampleLineCount, fromFile);
|
||||
public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception {
|
||||
return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount,
|
||||
fromFile);
|
||||
}
|
||||
|
||||
public FileStructureFinder findLogStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
|
||||
public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
|
||||
throws Exception {
|
||||
|
||||
CharsetMatch charsetMatch = findCharset(explanation, fromFile);
|
||||
|
@ -445,7 +445,7 @@ public final class GrokPatternCreator {
|
||||
@Override
|
||||
public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
|
||||
Collection<String> epilogues, Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
|
||||
String sampleValue = null;
|
||||
Collection<String> values = new ArrayList<>();
|
||||
for (String snippet : snippets) {
|
||||
Map<String, Object> captures = grok.captures(snippet);
|
||||
// If the pattern doesn't match then captures will be null
|
||||
@ -453,22 +453,24 @@ public final class GrokPatternCreator {
|
||||
throw new IllegalStateException("[%{" + grokPatternName + "}] does not match snippet [" + snippet + "]");
|
||||
}
|
||||
prefaces.add(captures.getOrDefault(PREFACE, "").toString());
|
||||
if (sampleValue == null) {
|
||||
sampleValue = captures.get(VALUE).toString();
|
||||
}
|
||||
values.add(captures.getOrDefault(VALUE, "").toString());
|
||||
epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
|
||||
}
|
||||
String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
|
||||
if (mappings != null) {
|
||||
Map<String, String> fullMappingType = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType);
|
||||
if ("date".equals(mappingType)) {
|
||||
TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(sampleValue);
|
||||
assert values.isEmpty() == false;
|
||||
TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(values.iterator().next());
|
||||
if (timestampMatch != null) {
|
||||
fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat();
|
||||
}
|
||||
}
|
||||
mappings.put(adjustedFieldName, fullMappingType);
|
||||
}
|
||||
if (fieldStats != null) {
|
||||
fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values));
|
||||
}
|
||||
return "%{" + grokPatternName + ":" + adjustedFieldName + "}";
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.rest;
|
||||
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.client.node.NodeClient;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.rest.BaseRestHandler;
|
||||
import org.elasticsearch.rest.RestController;
|
||||
import org.elasticsearch.rest.RestRequest;
|
||||
import org.elasticsearch.rest.action.RestToXContentListener;
|
||||
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
|
||||
import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
|
||||
import org.elasticsearch.xpack.ml.MachineLearning;
|
||||
import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
public class RestFindFileStructureAction extends BaseRestHandler {
|
||||
|
||||
public RestFindFileStructureAction(Settings settings, RestController controller) {
|
||||
super(settings);
|
||||
controller.registerHandler(RestRequest.Method.POST, MachineLearning.BASE_PATH + "find_file_structure", this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "xpack_ml_find_file_structure_action";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient client) throws IOException {
|
||||
|
||||
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
|
||||
request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(),
|
||||
FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT));
|
||||
if (restRequest.hasContent()) {
|
||||
request.setSample(restRequest.content());
|
||||
} else {
|
||||
throw new ElasticsearchParseException("request body is required");
|
||||
}
|
||||
|
||||
return channel -> client.execute(FindFileStructureAction.INSTANCE, request, new RestToXContentListener<>(channel));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Set<String> responseParams() {
|
||||
return Collections.singleton(FileStructure.EXPLAIN);
|
||||
}
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
{
|
||||
"xpack.ml.find_file_structure": {
|
||||
"documentation": "http://www.elastic.co/guide/en/elasticsearch/reference/current/ml-file-structure.html",
|
||||
"methods": [ "POST" ],
|
||||
"url": {
|
||||
"path": "/_xpack/ml/find_file_structure",
|
||||
"paths": [ "/_xpack/ml/find_file_structure" ],
|
||||
"params": {
|
||||
"lines_to_sample": {
|
||||
"type": "int",
|
||||
"description": "Optional parameter to specify how many lines of the file to include in the analysis"
|
||||
},
|
||||
"explain": {
|
||||
"type": "boolean",
|
||||
"description": "Optional parameter to include an commentary on how the structure was derived"
|
||||
}
|
||||
}
|
||||
},
|
||||
"body": {
|
||||
"description" : "The contents of the file to be analyzed",
|
||||
"required" : true,
|
||||
"serialize" : "bulk"
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,44 @@
|
||||
---
|
||||
"Test JSON file structure analysis":
|
||||
- do:
|
||||
headers:
|
||||
# This is to stop the usual content type randomization, which
|
||||
# would obviously ruin the results for this particular test
|
||||
Content-Type: "application/json"
|
||||
xpack.ml.find_file_structure:
|
||||
body:
|
||||
- airline: AAL
|
||||
responsetime: 132.2046
|
||||
sourcetype: file-structure-test
|
||||
time: 1403481600
|
||||
- airline: JZA
|
||||
responsetime: 990.4628
|
||||
sourcetype: file-structure-test
|
||||
time: 1403481700
|
||||
- airline: AAL
|
||||
responsetime: 134.2046
|
||||
sourcetype: file-structure-test
|
||||
time: 1403481800
|
||||
|
||||
- match: { num_lines_analyzed: 3 }
|
||||
- match: { num_messages_analyzed: 3 }
|
||||
- match: { charset: "UTF-8" }
|
||||
- match: { has_byte_order_marker: false }
|
||||
- match: { format: json }
|
||||
- match: { timestamp_field: time }
|
||||
- match: { timestamp_formats.0: UNIX }
|
||||
- match: { need_client_timezone: false }
|
||||
- match: { mappings.airline.type: keyword }
|
||||
- match: { mappings.responsetime.type: double }
|
||||
- match: { mappings.sourcetype.type: keyword }
|
||||
- match: { mappings.time.type: date }
|
||||
- match: { mappings.time.format: epoch_second }
|
||||
- match: { field_stats.airline.count: 3 }
|
||||
- match: { field_stats.airline.cardinality: 2 }
|
||||
- match: { field_stats.responsetime.count: 3 }
|
||||
- match: { field_stats.responsetime.cardinality: 3 }
|
||||
- match: { field_stats.sourcetype.count: 3 }
|
||||
- match: { field_stats.sourcetype.cardinality: 1 }
|
||||
- match: { field_stats.time.count: 3 }
|
||||
- match: { field_stats.time.cardinality: 3 }
|
||||
- match: { field_stats.time.cardinality: 3 }
|
Loading…
x
Reference in New Issue
Block a user