[ML] Rename input_fields to column_names in file structure (#33568)

This change tightens up the meaning of the "input_fields" field
in the file structure finder output.  Previously it was permitted
but not calculated for JSON and XML files.  Following this change
the field is called "column_names" and is only permitted for
delimited files.

Additionally, the way the column names are set for headerless
delimited files is refactored so that the naming logic is
encapsulated in one line of the code rather than being
duplicated in two places.
This commit is contained in:
David Roberts 2018-09-11 08:46:26 +01:00 committed by GitHub
parent ea3fdc90c6
commit 8e05ce567f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 45 additions and 39 deletions

View File

@ -92,7 +92,7 @@ public class FileStructure implements ToXContentObject, Writeable {
static final ParseField STRUCTURE = new ParseField("format");
static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
static final ParseField INPUT_FIELDS = new ParseField("input_fields");
static final ParseField COLUMN_NAMES = new ParseField("column_names");
static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
static final ParseField DELIMITER = new ParseField("delimiter");
static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
@ -115,7 +115,7 @@ public class FileStructure implements ToXContentObject, Writeable {
PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE);
PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
PARSER.declareStringArray(Builder::setInputFields, INPUT_FIELDS);
PARSER.declareStringArray(Builder::setColumnNames, COLUMN_NAMES);
PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
PARSER.declareString((p, c) -> p.setDelimiter(c.charAt(0)), DELIMITER);
PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
@ -142,7 +142,7 @@ public class FileStructure implements ToXContentObject, Writeable {
private final Format format;
private final String multilineStartPattern;
private final String excludeLinesPattern;
private final List<String> inputFields;
private final List<String> columnNames;
private final Boolean hasHeaderRow;
private final Character delimiter;
private final Boolean shouldTrimFields;
@ -155,7 +155,7 @@ public class FileStructure implements ToXContentObject, Writeable {
private final List<String> explanation;
public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
Format format, String multilineStartPattern, String excludeLinesPattern, List<String> inputFields,
Format format, String multilineStartPattern, String excludeLinesPattern, List<String> columnNames,
Boolean hasHeaderRow, Character delimiter, Boolean shouldTrimFields, String grokPattern, String timestampField,
List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
Map<String, FieldStats> fieldStats, List<String> explanation) {
@ -168,7 +168,7 @@ public class FileStructure implements ToXContentObject, Writeable {
this.format = Objects.requireNonNull(format);
this.multilineStartPattern = multilineStartPattern;
this.excludeLinesPattern = excludeLinesPattern;
this.inputFields = (inputFields == null) ? null : Collections.unmodifiableList(new ArrayList<>(inputFields));
this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames));
this.hasHeaderRow = hasHeaderRow;
this.delimiter = delimiter;
this.shouldTrimFields = shouldTrimFields;
@ -190,7 +190,7 @@ public class FileStructure implements ToXContentObject, Writeable {
format = in.readEnum(Format.class);
multilineStartPattern = in.readOptionalString();
excludeLinesPattern = in.readOptionalString();
inputFields = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
columnNames = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
hasHeaderRow = in.readOptionalBoolean();
delimiter = in.readBoolean() ? (char) in.readVInt() : null;
shouldTrimFields = in.readOptionalBoolean();
@ -213,11 +213,11 @@ public class FileStructure implements ToXContentObject, Writeable {
out.writeEnum(format);
out.writeOptionalString(multilineStartPattern);
out.writeOptionalString(excludeLinesPattern);
if (inputFields == null) {
if (columnNames == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeCollection(inputFields, StreamOutput::writeString);
out.writeCollection(columnNames, StreamOutput::writeString);
}
out.writeOptionalBoolean(hasHeaderRow);
if (delimiter == null) {
@ -273,8 +273,8 @@ public class FileStructure implements ToXContentObject, Writeable {
return excludeLinesPattern;
}
public List<String> getInputFields() {
return inputFields;
public List<String> getColumnNames() {
return columnNames;
}
public Boolean getHasHeaderRow() {
@ -335,8 +335,8 @@ public class FileStructure implements ToXContentObject, Writeable {
if (excludeLinesPattern != null && excludeLinesPattern.isEmpty() == false) {
builder.field(EXCLUDE_LINES_PATTERN.getPreferredName(), excludeLinesPattern);
}
if (inputFields != null && inputFields.isEmpty() == false) {
builder.field(INPUT_FIELDS.getPreferredName(), inputFields);
if (columnNames != null && columnNames.isEmpty() == false) {
builder.field(COLUMN_NAMES.getPreferredName(), columnNames);
}
if (hasHeaderRow != null) {
builder.field(HAS_HEADER_ROW.getPreferredName(), hasHeaderRow.booleanValue());
@ -377,7 +377,7 @@ public class FileStructure implements ToXContentObject, Writeable {
public int hashCode() {
return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField,
multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField,
timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
}
@ -402,7 +402,7 @@ public class FileStructure implements ToXContentObject, Writeable {
Objects.equals(this.format, that.format) &&
Objects.equals(this.multilineStartPattern, that.multilineStartPattern) &&
Objects.equals(this.excludeLinesPattern, that.excludeLinesPattern) &&
Objects.equals(this.inputFields, that.inputFields) &&
Objects.equals(this.columnNames, that.columnNames) &&
Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
Objects.equals(this.delimiter, that.delimiter) &&
Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
@ -424,7 +424,7 @@ public class FileStructure implements ToXContentObject, Writeable {
private Format format;
private String multilineStartPattern;
private String excludeLinesPattern;
private List<String> inputFields;
private List<String> columnNames;
private Boolean hasHeaderRow;
private Character delimiter;
private Boolean shouldTrimFields;
@ -484,8 +484,8 @@ public class FileStructure implements ToXContentObject, Writeable {
return this;
}
public Builder setInputFields(List<String> inputFields) {
this.inputFields = inputFields;
public Builder setColumnNames(List<String> columnNames) {
this.columnNames = columnNames;
return this;
}
@ -573,6 +573,9 @@ public class FileStructure implements ToXContentObject, Writeable {
}
// $FALL-THROUGH$
case XML:
if (columnNames != null) {
throw new IllegalArgumentException("Column names may not be specified for [" + format + "] structures.");
}
if (hasHeaderRow != null) {
throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
}
@ -584,8 +587,8 @@ public class FileStructure implements ToXContentObject, Writeable {
}
break;
case DELIMITED:
if (inputFields == null || inputFields.isEmpty()) {
throw new IllegalArgumentException("Input fields must be specified for [" + format + "] structures.");
if (columnNames == null || columnNames.isEmpty()) {
throw new IllegalArgumentException("Column names must be specified for [" + format + "] structures.");
}
if (hasHeaderRow == null) {
throw new IllegalArgumentException("Has header row must be specified for [" + format + "] structures.");
@ -598,8 +601,8 @@ public class FileStructure implements ToXContentObject, Writeable {
}
break;
case SEMI_STRUCTURED_TEXT:
if (inputFields != null) {
throw new IllegalArgumentException("Input fields may not be specified for [" + format + "] structures.");
if (columnNames != null) {
throw new IllegalArgumentException("Column names may not be specified for [" + format + "] structures.");
}
if (hasHeaderRow != null) {
throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
@ -635,7 +638,7 @@ public class FileStructure implements ToXContentObject, Writeable {
}
return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, delimiter, shouldTrimFields, grokPattern,
multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern,
timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
}
}

View File

@ -50,18 +50,17 @@ public class FileStructureTests extends AbstractSerializingTestCase<FileStructur
builder.setExcludeLinesPattern(randomAlphaOfLength(100));
}
if (format == FileStructure.Format.DELIMITED || (format.supportsNesting() && randomBoolean())) {
builder.setInputFields(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
}
if (format == FileStructure.Format.DELIMITED) {
builder.setColumnNames(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
builder.setHasHeaderRow(randomBoolean());
builder.setDelimiter(randomFrom(',', '\t', ';', '|'));
}
if (format.isSemiStructured()) {
if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) {
builder.setGrokPattern(randomAlphaOfLength(100));
}
if (format.isSemiStructured() || randomBoolean()) {
if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT || randomBoolean()) {
builder.setTimestampField(randomAlphaOfLength(10));
builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
builder.setNeedClientTimezone(randomBoolean());

View File

@ -49,10 +49,12 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows);
boolean isHeaderInFile = headerInfo.v1();
String[] header = headerInfo.v2();
String[] headerWithNamedBlanks = new String[header.length];
// The column names are the header names but with blanks named column1, column2, etc.
String[] columnNames = new String[header.length];
for (int i = 0; i < header.length; ++i) {
String rawHeader = header[i].isEmpty() ? "column" + (i + 1) : header[i];
headerWithNamedBlanks[i] = trimFields ? rawHeader.trim() : rawHeader;
assert header[i] != null;
String rawHeader = trimFields ? header[i].trim() : header[i];
columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader;
}
List<String> sampleLines = Arrays.asList(sample.split("\n"));
@ -63,7 +65,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
List<String> row = rows.get(index);
int lineNumber = lineNumbers.get(index);
Map<String, String> sampleRecord = new LinkedHashMap<>();
Util.filterListToMap(sampleRecord, headerWithNamedBlanks,
Util.filterListToMap(sampleRecord, columnNames,
trimFields ? row.stream().map(String::trim).collect(Collectors.toList()) : row);
sampleRecords.add(sampleRecord);
sampleMessages.add(
@ -82,7 +84,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
.setNumMessagesAnalyzed(sampleRecords.size())
.setHasHeaderRow(isHeaderInFile)
.setDelimiter(delimiter)
.setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList()));
.setColumnNames(Arrays.stream(columnNames).collect(Collectors.toList()));
if (trimFields) {
structureBuilder.setShouldTrimFields(true);
@ -225,7 +227,9 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
// SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new));
} else {
return new Tuple<>(false, IntStream.rangeClosed(1, firstRow.size()).mapToObj(num -> "column" + num).toArray(String[]::new));
String[] dummyHeader = new String[firstRow.size()];
Arrays.fill(dummyHeader, "");
return new Tuple<>(false, dummyHeader);
}
}

View File

@ -45,7 +45,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
assertEquals(Character.valueOf(','), structure.getDelimiter());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("time", "message"), structure.getInputFields());
assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
assertNull(structure.getGrokPattern());
assertEquals("time", structure.getTimestampField());
assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
@ -76,7 +76,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
assertEquals(Character.valueOf(','), structure.getDelimiter());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("message", "time", "count"), structure.getInputFields());
assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames());
assertNull(structure.getGrokPattern());
assertEquals("time", structure.getTimestampField());
assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
@ -114,7 +114,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
"RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
"tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getInputFields());
"tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames());
assertNull(structure.getGrokPattern());
assertEquals("tpep_pickup_datetime", structure.getTimestampField());
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
@ -152,7 +152,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
"RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
"tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getInputFields());
"tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getColumnNames());
assertNull(structure.getGrokPattern());
assertEquals("tpep_pickup_datetime", structure.getTimestampField());
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
@ -183,7 +183,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
assertEquals(Character.valueOf(','), structure.getDelimiter());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getInputFields());
assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames());
assertNull(structure.getGrokPattern());
assertEquals("timestamp", structure.getTimestampField());
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats());
@ -213,7 +213,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1());
assertFalse(header.v1());
assertThat(header.v2(), arrayContaining("column1", "column2", "column3", "column4"));
assertThat(header.v2(), arrayContaining("", "", "", ""));
}
public void testLevenshteinDistance() {