[ML] Change default data_description to JSON (elastic/x-pack-elasticsearch#707)

relates elastic/x-pack-elasticsearch#674

Original commit: elastic/x-pack-elasticsearch@efc5f4a269
This commit is contained in:
Dimitris Athanasiou 2017-03-16 11:48:57 +00:00 committed by GitHub
parent 803654d9bf
commit 6c9727c2db
7 changed files with 84 additions and 38 deletions

View File

@ -123,11 +123,10 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
private final DataFormat dataFormat; private final DataFormat dataFormat;
private final String timeFieldName; private final String timeFieldName;
private final String timeFormat; private final String timeFormat;
private final char fieldDelimiter; private final Character fieldDelimiter;
private final char quoteCharacter; private final Character quoteCharacter;
public static final ObjectParser<Builder, Void> PARSER = public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>(DATA_DESCRIPTION_FIELD.getPreferredName(), Builder::new);
new ObjectParser<>(DATA_DESCRIPTION_FIELD.getPreferredName(), Builder::new);
static { static {
PARSER.declareString(Builder::setFormat, FORMAT_FIELD); PARSER.declareString(Builder::setFormat, FORMAT_FIELD);
@ -137,7 +136,8 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
PARSER.declareField(Builder::setQuoteCharacter, DataDescription::extractChar, QUOTE_CHARACTER_FIELD, ValueType.STRING); PARSER.declareField(Builder::setQuoteCharacter, DataDescription::extractChar, QUOTE_CHARACTER_FIELD, ValueType.STRING);
} }
public DataDescription(DataFormat dataFormat, String timeFieldName, String timeFormat, char fieldDelimiter, char quoteCharacter) { public DataDescription(DataFormat dataFormat, String timeFieldName, String timeFormat, Character fieldDelimiter,
Character quoteCharacter) {
this.dataFormat = dataFormat; this.dataFormat = dataFormat;
this.timeFieldName = timeFieldName; this.timeFieldName = timeFieldName;
this.timeFormat = timeFormat; this.timeFormat = timeFormat;
@ -149,8 +149,8 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
dataFormat = DataFormat.readFromStream(in); dataFormat = DataFormat.readFromStream(in);
timeFieldName = in.readString(); timeFieldName = in.readString();
timeFormat = in.readString(); timeFormat = in.readString();
fieldDelimiter = (char) in.read(); fieldDelimiter = in.readBoolean() ? (char) in.read() : null;
quoteCharacter = (char) in.read(); quoteCharacter = in.readBoolean() ? (char) in.read() : null;
} }
@Override @Override
@ -158,25 +158,41 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
dataFormat.writeTo(out); dataFormat.writeTo(out);
out.writeString(timeFieldName); out.writeString(timeFieldName);
out.writeString(timeFormat); out.writeString(timeFormat);
out.write(fieldDelimiter); if (fieldDelimiter != null) {
out.write(quoteCharacter); out.writeBoolean(true);
out.write(fieldDelimiter);
} else {
out.writeBoolean(false);
}
if (quoteCharacter != null) {
out.writeBoolean(true);
out.write(quoteCharacter);
} else {
out.writeBoolean(false);
}
} }
@Override @Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(); builder.startObject();
builder.field(FORMAT_FIELD.getPreferredName(), dataFormat); if (dataFormat != DataFormat.JSON) {
builder.field(FORMAT_FIELD.getPreferredName(), dataFormat);
}
builder.field(TIME_FIELD_NAME_FIELD.getPreferredName(), timeFieldName); builder.field(TIME_FIELD_NAME_FIELD.getPreferredName(), timeFieldName);
builder.field(TIME_FORMAT_FIELD.getPreferredName(), timeFormat); builder.field(TIME_FORMAT_FIELD.getPreferredName(), timeFormat);
builder.field(FIELD_DELIMITER_FIELD.getPreferredName(), String.valueOf(fieldDelimiter)); if (fieldDelimiter != null) {
builder.field(QUOTE_CHARACTER_FIELD.getPreferredName(), String.valueOf(quoteCharacter)); builder.field(FIELD_DELIMITER_FIELD.getPreferredName(), String.valueOf(fieldDelimiter));
}
if (quoteCharacter != null) {
builder.field(QUOTE_CHARACTER_FIELD.getPreferredName(), String.valueOf(quoteCharacter));
}
builder.endObject(); builder.endObject();
return builder; return builder;
} }
/** /**
* The format of the data to be processed. * The format of the data to be processed.
* Defaults to {@link DataDescription.DataFormat#DELIMITED} * Defaults to {@link DataDescription.DataFormat#JSON}
* *
* @return The data format * @return The data format
*/ */
@ -196,8 +212,8 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
/** /**
* Either {@value #EPOCH}, {@value #EPOCH_MS} or a SimpleDateTime format string. * Either {@value #EPOCH}, {@value #EPOCH_MS} or a SimpleDateTime format string.
* If not set (is <code>null</code> or an empty string) or set to * If not set (is <code>null</code> or an empty string) or set to
* {@value #EPOCH} (the default) then the date is assumed to be in * {@value #EPOCH_MS} (the default) then the date is assumed to be in
* seconds from the epoch. * milliseconds from the epoch.
* *
* @return A String if set or <code>null</code> * @return A String if set or <code>null</code>
*/ */
@ -209,21 +225,21 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
* If the data is in a delineated format with a header e.g. csv or tsv * If the data is in a delineated format with a header e.g. csv or tsv
* this is the delimiter character used. This is only applicable if * this is the delimiter character used. This is only applicable if
* {@linkplain #getFormat()} is {@link DataDescription.DataFormat#DELIMITED}. * {@linkplain #getFormat()} is {@link DataDescription.DataFormat#DELIMITED}.
* The default value is {@value #DEFAULT_DELIMITER} * The default value for delimited format is {@value #DEFAULT_DELIMITER}.
* *
* @return A char * @return A char
*/ */
public char getFieldDelimiter() { public Character getFieldDelimiter() {
return fieldDelimiter; return fieldDelimiter;
} }
/** /**
* The quote character used in delineated formats. * The quote character used in delineated formats.
* Defaults to {@value #DEFAULT_QUOTE_CHAR} * The default value for delimited format is {@value #DEFAULT_QUOTE_CHAR}.
* *
* @return The delineated format quote character * @return The delineated format quote character
*/ */
public char getQuoteCharacter() { public Character getQuoteCharacter() {
return quoteCharacter; return quoteCharacter;
} }
@ -236,8 +252,7 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
* @return True if the data should be transformed. * @return True if the data should be transformed.
*/ */
public boolean transform() { public boolean transform() {
return dataFormat == DataFormat.JSON || return dataFormat == DataFormat.JSON || isTransformTime();
isTransformTime();
} }
/** /**
@ -260,7 +275,7 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
return EPOCH_MS.equals(timeFormat); return EPOCH_MS.equals(timeFormat);
} }
private static char extractChar(XContentParser parser) throws IOException { private static Character extractChar(XContentParser parser) throws IOException {
if (parser.currentToken() == XContentParser.Token.VALUE_STRING) { if (parser.currentToken() == XContentParser.Token.VALUE_STRING) {
String charStr = parser.text(); String charStr = parser.text();
if (charStr.length() != 1) { if (charStr.length() != 1) {
@ -287,7 +302,7 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
DataDescription that = (DataDescription) other; DataDescription that = (DataDescription) other;
return this.dataFormat == that.dataFormat && return this.dataFormat == that.dataFormat &&
this.quoteCharacter == that.quoteCharacter && Objects.equals(this.quoteCharacter, that.quoteCharacter) &&
Objects.equals(this.timeFieldName, that.timeFieldName) && Objects.equals(this.timeFieldName, that.timeFieldName) &&
Objects.equals(this.timeFormat, that.timeFormat) && Objects.equals(this.timeFormat, that.timeFormat) &&
Objects.equals(this.fieldDelimiter, that.fieldDelimiter); Objects.equals(this.fieldDelimiter, that.fieldDelimiter);
@ -295,17 +310,16 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
@Override @Override
public int hashCode() { public int hashCode() {
return Objects.hash(dataFormat, quoteCharacter, timeFieldName, return Objects.hash(dataFormat, quoteCharacter, timeFieldName, timeFormat, fieldDelimiter);
timeFormat, fieldDelimiter);
} }
public static class Builder { public static class Builder {
private DataFormat dataFormat = DataFormat.DELIMITED; private DataFormat dataFormat = DataFormat.JSON;
private String timeFieldName = DEFAULT_TIME_FIELD; private String timeFieldName = DEFAULT_TIME_FIELD;
private String timeFormat = EPOCH; private String timeFormat = EPOCH_MS;
private char fieldDelimiter = DEFAULT_DELIMITER; private Character fieldDelimiter;
private char quoteCharacter = DEFAULT_QUOTE_CHAR; private Character quoteCharacter;
public void setFormat(DataFormat format) { public void setFormat(DataFormat format) {
dataFormat = ExceptionsHelper.requireNonNull(format, FORMAT_FIELD.getPreferredName() + " must not be null"); dataFormat = ExceptionsHelper.requireNonNull(format, FORMAT_FIELD.getPreferredName() + " must not be null");
@ -335,18 +349,24 @@ public class DataDescription extends ToXContentToBytes implements Writeable {
timeFormat = format; timeFormat = format;
} }
public void setFieldDelimiter(char delimiter) { public void setFieldDelimiter(Character delimiter) {
fieldDelimiter = delimiter; fieldDelimiter = delimiter;
} }
public void setQuoteCharacter(char value) { public void setQuoteCharacter(Character value) {
quoteCharacter = value; quoteCharacter = value;
} }
public DataDescription build() { public DataDescription build() {
return new DataDescription(dataFormat, timeFieldName, timeFormat, fieldDelimiter,quoteCharacter); if (dataFormat == DataFormat.DELIMITED) {
if (fieldDelimiter == null) {
fieldDelimiter = DEFAULT_DELIMITER;
}
if (quoteCharacter == null) {
quoteCharacter = DEFAULT_QUOTE_CHAR;
}
}
return new DataDescription(dataFormat, timeFieldName, timeFormat, fieldDelimiter, quoteCharacter);
} }
} }
} }

View File

@ -57,10 +57,10 @@ public class ValidateJobConfigActionRequestTests extends AbstractStreamableXCont
if (randomBoolean()) { if (randomBoolean()) {
dataDescription.setFormat(DataDescription.DataFormat.DELIMITED); dataDescription.setFormat(DataDescription.DataFormat.DELIMITED);
if (randomBoolean()) { if (randomBoolean()) {
dataDescription.setFieldDelimiter(';'); dataDescription.setFieldDelimiter(new Character(';'));
} }
if (randomBoolean()) { if (randomBoolean()) {
dataDescription.setQuoteCharacter('\''); dataDescription.setQuoteCharacter(new Character('\''));
} }
} else { } else {
dataDescription.setFormat(DataDescription.DataFormat.JSON); dataDescription.setFormat(DataDescription.DataFormat.JSON);

View File

@ -16,10 +16,34 @@ import org.elasticsearch.xpack.ml.job.messages.Messages;
import org.elasticsearch.xpack.ml.support.AbstractSerializingTestCase; import org.elasticsearch.xpack.ml.support.AbstractSerializingTestCase;
import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.core.Is.is;
public class DataDescriptionTests extends AbstractSerializingTestCase<DataDescription> { public class DataDescriptionTests extends AbstractSerializingTestCase<DataDescription> {
public void testDefault() {
DataDescription dataDescription = new DataDescription.Builder().build();
assertThat(dataDescription.getFormat(), equalTo(DataFormat.JSON));
assertThat(dataDescription.getTimeField(), equalTo("time"));
assertThat(dataDescription.getTimeFormat(), equalTo("epoch_ms"));
assertThat(dataDescription.getFieldDelimiter(), is(nullValue()));
assertThat(dataDescription.getQuoteCharacter(), is(nullValue()));
}
public void testDefaultDelimited() {
DataDescription.Builder dataDescriptionBuilder = new DataDescription.Builder();
dataDescriptionBuilder.setFormat(DataFormat.DELIMITED);
DataDescription dataDescription = dataDescriptionBuilder.build();
assertThat(dataDescription.getFormat(), equalTo(DataFormat.DELIMITED));
assertThat(dataDescription.getTimeField(), equalTo("time"));
assertThat(dataDescription.getTimeFormat(), equalTo("epoch_ms"));
assertThat(dataDescription.getFieldDelimiter(), is('\t'));
assertThat(dataDescription.getQuoteCharacter(), is('"'));
}
public void testVerify_GivenValidFormat() { public void testVerify_GivenValidFormat() {
DataDescription.Builder description = new DataDescription.Builder(); DataDescription.Builder description = new DataDescription.Builder();
description.setTimeFormat("epoch"); description.setTimeFormat("epoch");

View File

@ -46,6 +46,7 @@ public class ProcessCtrlTests extends ESTestCase {
job.setAnalysisConfig(acBuilder); job.setAnalysisConfig(acBuilder);
DataDescription.Builder dd = new DataDescription.Builder(); DataDescription.Builder dd = new DataDescription.Builder();
dd.setFormat(DataDescription.DataFormat.DELIMITED);
dd.setFieldDelimiter('|'); dd.setFieldDelimiter('|');
dd.setTimeField("tf"); dd.setTimeField("tf");
job.setDataDescription(dd); job.setDataDescription(dd);

View File

@ -356,6 +356,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
private Job createJobDetails(String jobId) { private Job createJobDetails(String jobId) {
DataDescription.Builder dd = new DataDescription.Builder(); DataDescription.Builder dd = new DataDescription.Builder();
dd.setFormat(DataDescription.DataFormat.DELIMITED); dd.setFormat(DataDescription.DataFormat.DELIMITED);
dd.setTimeFormat("epoch");
dd.setFieldDelimiter(','); dd.setFieldDelimiter(',');
Detector d = new Detector.Builder("metric", "value").build(); Detector d = new Detector.Builder("metric", "value").build();

View File

@ -62,8 +62,8 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
}).when(autodetectProcess).writeRecord(any(String[].class)); }).when(autodetectProcess).writeRecord(any(String[].class));
dataDescription = new DataDescription.Builder(); dataDescription = new DataDescription.Builder();
dataDescription.setFieldDelimiter(',');
dataDescription.setFormat(DataFormat.DELIMITED); dataDescription.setFormat(DataFormat.DELIMITED);
dataDescription.setFieldDelimiter(',');
dataDescription.setTimeFormat(DataDescription.EPOCH); dataDescription.setTimeFormat(DataDescription.EPOCH);
Detector detector = new Detector.Builder("metric", "value").build(); Detector detector = new Detector.Builder("metric", "value").build();

View File

@ -31,7 +31,7 @@ public class CsvParserTests extends ESTestCase {
InputStream inputStream = new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)); InputStream inputStream = new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8));
CsvPreference csvPref = new CsvPreference.Builder( CsvPreference csvPref = new CsvPreference.Builder(
DataDescription.DEFAULT_QUOTE_CHAR, '"',
',', ',',
new String(new char[]{DataDescription.LINE_ENDING})).build(); new String(new char[]{DataDescription.LINE_ENDING})).build();