diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java index 5484f9f9902..dd508dfb36b 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java @@ -92,7 +92,7 @@ public class FileStructure implements ToXContentObject, Writeable { static final ParseField STRUCTURE = new ParseField("format"); static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern"); static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern"); - static final ParseField INPUT_FIELDS = new ParseField("input_fields"); + static final ParseField COLUMN_NAMES = new ParseField("column_names"); static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row"); static final ParseField DELIMITER = new ParseField("delimiter"); static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields"); @@ -115,7 +115,7 @@ public class FileStructure implements ToXContentObject, Writeable { PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE); PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN); PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN); - PARSER.declareStringArray(Builder::setInputFields, INPUT_FIELDS); + PARSER.declareStringArray(Builder::setColumnNames, COLUMN_NAMES); PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW); PARSER.declareString((p, c) -> p.setDelimiter(c.charAt(0)), DELIMITER); PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS); @@ -142,7 +142,7 @@ public class FileStructure implements ToXContentObject, Writeable { private final Format format; private final String multilineStartPattern; private final String excludeLinesPattern; - private final List inputFields; + private final List columnNames; private final Boolean hasHeaderRow; private final Character delimiter; private final Boolean shouldTrimFields; @@ -155,7 +155,7 @@ public class FileStructure implements ToXContentObject, Writeable { private final List explanation; public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker, - Format format, String multilineStartPattern, String excludeLinesPattern, List inputFields, + Format format, String multilineStartPattern, String excludeLinesPattern, List columnNames, Boolean hasHeaderRow, Character delimiter, Boolean shouldTrimFields, String grokPattern, String timestampField, List timestampFormats, boolean needClientTimezone, Map mappings, Map fieldStats, List explanation) { @@ -168,7 +168,7 @@ public class FileStructure implements ToXContentObject, Writeable { this.format = Objects.requireNonNull(format); this.multilineStartPattern = multilineStartPattern; this.excludeLinesPattern = excludeLinesPattern; - this.inputFields = (inputFields == null) ? null : Collections.unmodifiableList(new ArrayList<>(inputFields)); + this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames)); this.hasHeaderRow = hasHeaderRow; this.delimiter = delimiter; this.shouldTrimFields = shouldTrimFields; @@ -190,7 +190,7 @@ public class FileStructure implements ToXContentObject, Writeable { format = in.readEnum(Format.class); multilineStartPattern = in.readOptionalString(); excludeLinesPattern = in.readOptionalString(); - inputFields = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; + columnNames = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; hasHeaderRow = in.readOptionalBoolean(); delimiter = in.readBoolean() ? (char) in.readVInt() : null; shouldTrimFields = in.readOptionalBoolean(); @@ -213,11 +213,11 @@ public class FileStructure implements ToXContentObject, Writeable { out.writeEnum(format); out.writeOptionalString(multilineStartPattern); out.writeOptionalString(excludeLinesPattern); - if (inputFields == null) { + if (columnNames == null) { out.writeBoolean(false); } else { out.writeBoolean(true); - out.writeCollection(inputFields, StreamOutput::writeString); + out.writeCollection(columnNames, StreamOutput::writeString); } out.writeOptionalBoolean(hasHeaderRow); if (delimiter == null) { @@ -273,8 +273,8 @@ public class FileStructure implements ToXContentObject, Writeable { return excludeLinesPattern; } - public List getInputFields() { - return inputFields; + public List getColumnNames() { + return columnNames; } public Boolean getHasHeaderRow() { @@ -335,8 +335,8 @@ public class FileStructure implements ToXContentObject, Writeable { if (excludeLinesPattern != null && excludeLinesPattern.isEmpty() == false) { builder.field(EXCLUDE_LINES_PATTERN.getPreferredName(), excludeLinesPattern); } - if (inputFields != null && inputFields.isEmpty() == false) { - builder.field(INPUT_FIELDS.getPreferredName(), inputFields); + if (columnNames != null && columnNames.isEmpty() == false) { + builder.field(COLUMN_NAMES.getPreferredName(), columnNames); } if (hasHeaderRow != null) { builder.field(HAS_HEADER_ROW.getPreferredName(), hasHeaderRow.booleanValue()); @@ -377,7 +377,7 @@ public class FileStructure implements ToXContentObject, Writeable { public int hashCode() { return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, - multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField, + multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); } @@ -402,7 +402,7 @@ public class FileStructure implements ToXContentObject, Writeable { Objects.equals(this.format, that.format) && Objects.equals(this.multilineStartPattern, that.multilineStartPattern) && Objects.equals(this.excludeLinesPattern, that.excludeLinesPattern) && - Objects.equals(this.inputFields, that.inputFields) && + Objects.equals(this.columnNames, that.columnNames) && Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && Objects.equals(this.delimiter, that.delimiter) && Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && @@ -424,7 +424,7 @@ public class FileStructure implements ToXContentObject, Writeable { private Format format; private String multilineStartPattern; private String excludeLinesPattern; - private List inputFields; + private List columnNames; private Boolean hasHeaderRow; private Character delimiter; private Boolean shouldTrimFields; @@ -484,8 +484,8 @@ public class FileStructure implements ToXContentObject, Writeable { return this; } - public Builder setInputFields(List inputFields) { - this.inputFields = inputFields; + public Builder setColumnNames(List columnNames) { + this.columnNames = columnNames; return this; } @@ -573,6 +573,9 @@ public class FileStructure implements ToXContentObject, Writeable { } // $FALL-THROUGH$ case XML: + if (columnNames != null) { + throw new IllegalArgumentException("Column names may not be specified for [" + format + "] structures."); + } if (hasHeaderRow != null) { throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures."); } @@ -584,8 +587,8 @@ public class FileStructure implements ToXContentObject, Writeable { } break; case DELIMITED: - if (inputFields == null || inputFields.isEmpty()) { - throw new IllegalArgumentException("Input fields must be specified for [" + format + "] structures."); + if (columnNames == null || columnNames.isEmpty()) { + throw new IllegalArgumentException("Column names must be specified for [" + format + "] structures."); } if (hasHeaderRow == null) { throw new IllegalArgumentException("Has header row must be specified for [" + format + "] structures."); @@ -598,8 +601,8 @@ public class FileStructure implements ToXContentObject, Writeable { } break; case SEMI_STRUCTURED_TEXT: - if (inputFields != null) { - throw new IllegalArgumentException("Input fields may not be specified for [" + format + "] structures."); + if (columnNames != null) { + throw new IllegalArgumentException("Column names may not be specified for [" + format + "] structures."); } if (hasHeaderRow != null) { throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures."); @@ -635,7 +638,7 @@ public class FileStructure implements ToXContentObject, Writeable { } return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, - multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, + multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java index 6dcf6751965..e09b9e3f91e 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java @@ -50,18 +50,17 @@ public class FileStructureTests extends AbstractSerializingTestCase headerInfo = findHeaderFromSample(explanation, rows); boolean isHeaderInFile = headerInfo.v1(); String[] header = headerInfo.v2(); - String[] headerWithNamedBlanks = new String[header.length]; + // The column names are the header names but with blanks named column1, column2, etc. + String[] columnNames = new String[header.length]; for (int i = 0; i < header.length; ++i) { - String rawHeader = header[i].isEmpty() ? "column" + (i + 1) : header[i]; - headerWithNamedBlanks[i] = trimFields ? rawHeader.trim() : rawHeader; + assert header[i] != null; + String rawHeader = trimFields ? header[i].trim() : header[i]; + columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader; } List sampleLines = Arrays.asList(sample.split("\n")); @@ -63,7 +65,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { List row = rows.get(index); int lineNumber = lineNumbers.get(index); Map sampleRecord = new LinkedHashMap<>(); - Util.filterListToMap(sampleRecord, headerWithNamedBlanks, + Util.filterListToMap(sampleRecord, columnNames, trimFields ? row.stream().map(String::trim).collect(Collectors.toList()) : row); sampleRecords.add(sampleRecord); sampleMessages.add( @@ -82,7 +84,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { .setNumMessagesAnalyzed(sampleRecords.size()) .setHasHeaderRow(isHeaderInFile) .setDelimiter(delimiter) - .setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList())); + .setColumnNames(Arrays.stream(columnNames).collect(Collectors.toList())); if (trimFields) { structureBuilder.setShouldTrimFields(true); @@ -225,7 +227,9 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new)); } else { - return new Tuple<>(false, IntStream.rangeClosed(1, firstRow.size()).mapToObj(num -> "column" + num).toArray(String[]::new)); + String[] dummyHeader = new String[firstRow.size()]; + Arrays.fill(dummyHeader, ""); + return new Tuple<>(false, dummyHeader); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 6d1f039399e..4e692d58391 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -45,7 +45,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { assertEquals(Character.valueOf(','), structure.getDelimiter()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("time", "message"), structure.getInputFields()); + assertEquals(Arrays.asList("time", "message"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("time", structure.getTimestampField()); assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); @@ -76,7 +76,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { assertEquals(Character.valueOf(','), structure.getDelimiter()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("message", "time", "count"), structure.getInputFields()); + assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("time", structure.getTimestampField()); assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); @@ -114,7 +114,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", - "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getInputFields()); + "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("tpep_pickup_datetime", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); @@ -152,7 +152,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", - "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getInputFields()); + "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("tpep_pickup_datetime", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); @@ -183,7 +183,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { assertEquals(Character.valueOf(','), structure.getDelimiter()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getInputFields()); + assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("timestamp", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats()); @@ -213,7 +213,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1()); assertFalse(header.v1()); - assertThat(header.v2(), arrayContaining("column1", "column2", "column3", "column4")); + assertThat(header.v2(), arrayContaining("", "", "", "")); } public void testLevenshteinDistance() {