[ML] Improve multiline_start_pattern for CSV in find_file_structure (#51737)
The work to switch file upload over to treating delimited files like semi-structured text and using the ingest pipeline for CSV parsing makes the multi-line start pattern used for delimited files much more critical than it used to be. Previously it was always based on the time field, even if that was towards the end of the columns, and no multi-line pattern was created if no timestamp was detected.

This change improves the multi-line start pattern by:

1. Never creating a multi-line pattern if the sample contained only single line records. This improves the import efficiency in a common case.
2. Choosing the leftmost field that has a well-defined pattern, whether that be the time field or a boolean/numeric field. This reduces the risk of a field with newlines occurring earlier, and also means the algorithm doesn't automatically fail for data without a timestamp.
This commit is contained in:
parent
c2b08bb72f
commit
9d55c45b5a
|
@ -72,10 +72,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
}
|
||||
}
|
||||
|
||||
int maxLinesPerMessage = 1;
|
||||
List<String> sampleLines = Arrays.asList(sample.split("\n"));
|
||||
List<String> sampleMessages = new ArrayList<>();
|
||||
List<Map<String, ?>> sampleRecords = new ArrayList<>();
|
||||
int prevMessageEndLineNumber = isHeaderInFile ? lineNumbers.get(0) : -1;
|
||||
int prevMessageEndLineNumber = isHeaderInFile ? lineNumbers.get(0) : 0; // This is an exclusive end
|
||||
for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
|
||||
List<String> row = rows.get(index);
|
||||
int lineNumber = lineNumbers.get(index);
|
||||
|
@ -83,8 +84,8 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
Util.filterListToMap(sampleRecord, columnNames,
|
||||
trimFields ? row.stream().map(field -> (field == null) ? null : field.trim()).collect(Collectors.toList()) : row);
|
||||
sampleRecords.add(sampleRecord);
|
||||
sampleMessages.add(
|
||||
String.join("\n", sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index))));
|
||||
sampleMessages.add(String.join("\n", sampleLines.subList(prevMessageEndLineNumber, lineNumber)));
|
||||
maxLinesPerMessage = Math.max(maxLinesPerMessage, lineNumber - prevMessageEndLineNumber);
|
||||
prevMessageEndLineNumber = lineNumber;
|
||||
}
|
||||
|
||||
|
@ -102,8 +103,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
char delimiter = (char) csvPreference.getDelimiterChar();
|
||||
char quoteChar = csvPreference.getQuoteChar();
|
||||
|
||||
Map<String, Object> csvProcessorSettings = makeCsvProcessorSettings("message", columnNamesList, delimiter, quoteChar,
|
||||
trimFields);
|
||||
Map<String, Object> csvProcessorSettings = makeCsvProcessorSettings("message", columnNamesList, delimiter, quoteChar, trimFields);
|
||||
|
||||
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.DELIMITED)
|
||||
.setCharset(charsetName)
|
||||
|
@ -116,15 +116,17 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
.setQuote(quoteChar)
|
||||
.setColumnNames(columnNamesList);
|
||||
|
||||
String quote = String.valueOf(quoteChar);
|
||||
String twoQuotes = quote + quote;
|
||||
String quotePattern = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
|
||||
String optQuotePattern = quotePattern + "?";
|
||||
String delimiterPattern =
|
||||
(delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
|
||||
if (isHeaderInFile) {
|
||||
String quote = String.valueOf(quoteChar);
|
||||
String twoQuotes = quote + quote;
|
||||
String optQuote = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + "?";
|
||||
String delimiterMatcher =
|
||||
(delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
|
||||
structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
|
||||
.map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote)
|
||||
.collect(Collectors.joining(delimiterMatcher)));
|
||||
.map(column ->
|
||||
optQuotePattern + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuotePattern)
|
||||
.collect(Collectors.joining(delimiterPattern)));
|
||||
}
|
||||
|
||||
if (trimFields) {
|
||||
|
@ -134,28 +136,6 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
Tuple<String, TimestampFormatFinder> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides,
|
||||
timeoutChecker);
|
||||
if (timeField != null) {
|
||||
String timeLineRegex = null;
|
||||
StringBuilder builder = new StringBuilder("^");
|
||||
// We make the assumption that the timestamp will be on the first line of each record. Therefore, if the
|
||||
// timestamp is the last column then either our assumption is wrong (and the approach will completely
|
||||
// break down) or else every record is on a single line and there's no point creating a multiline config.
|
||||
// This is why the loop excludes the last column.
|
||||
for (String column : Arrays.asList(columnNames).subList(0, columnNames.length - 1)) {
|
||||
if (timeField.v1().equals(column)) {
|
||||
builder.append("\"?");
|
||||
String simpleTimePattern = timeField.v2().getSimplePattern().pattern();
|
||||
builder.append(simpleTimePattern.startsWith("\\b") ? simpleTimePattern.substring(2) : simpleTimePattern);
|
||||
timeLineRegex = builder.toString();
|
||||
break;
|
||||
} else {
|
||||
builder.append(".*?");
|
||||
if (delimiter == '\t') {
|
||||
builder.append("\\t");
|
||||
} else {
|
||||
builder.append(delimiter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing();
|
||||
|
||||
|
@ -165,14 +145,15 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
.setNeedClientTimezone(needClientTimeZone)
|
||||
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
|
||||
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone))
|
||||
.setMultilineStartPattern(timeLineRegex);
|
||||
.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage, delimiterPattern,
|
||||
quotePattern, mappings, timeField.v1(), timeField.v2()));
|
||||
|
||||
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
|
||||
} else {
|
||||
structureBuilder.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(),
|
||||
csvProcessorSettings, mappings, null, null, false));
|
||||
}
|
||||
|
||||
if (timeField != null) {
|
||||
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
|
||||
structureBuilder.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage,
|
||||
delimiterPattern, quotePattern, mappings, null, null));
|
||||
}
|
||||
|
||||
if (mappingsAndFieldStats.v2() != null) {
|
||||
|
@ -609,4 +590,84 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
}
|
||||
return Collections.unmodifiableMap(csvProcessorSettings);
|
||||
}
|
||||
|
||||
/**
|
||||
* The multi-line start pattern is based on the first field in the line that is boolean, numeric
|
||||
* or the detected timestamp, and consists of a pattern matching that field, preceded by wildcards
|
||||
* to match any prior fields and to match the delimiters in between them.
|
||||
*
|
||||
* This is based on the observation that a boolean, numeric or timestamp field will not contain a
|
||||
* newline.
|
||||
*
|
||||
* The approach works best when the chosen field is early in each record, ideally the very first
|
||||
* field. It doesn't work when fields prior to the chosen field contain newlines in some of the
|
||||
* records.
|
||||
*/
|
||||
static String makeMultilineStartPattern(List<String> explanation, List<String> columnNames, int maxLinesPerMessage,
|
||||
String delimiterPattern, String quotePattern, Map<String, Object> mappings,
|
||||
String timeFieldName, TimestampFormatFinder timeFieldFormat) {
|
||||
|
||||
assert columnNames.isEmpty() == false;
|
||||
assert maxLinesPerMessage > 0;
|
||||
assert (timeFieldName == null) == (timeFieldFormat == null);
|
||||
|
||||
// This is the easy case: a file where there are no multi-line fields
|
||||
if (maxLinesPerMessage == 1) {
|
||||
explanation.add("Not creating a multi-line start pattern as no sampled message spanned multiple lines");
|
||||
return null;
|
||||
}
|
||||
|
||||
StringBuilder builder = new StringBuilder("^");
|
||||
// Look for a field early in the line that cannot be a multi-line field based on the type we've determined for
|
||||
// it, and create a pattern that matches this field with the appropriate number of delimiters before it.
|
||||
// There is no point doing this for the last field on the line, so this is why the loop excludes the last column.
|
||||
for (String columnName : columnNames.subList(0, columnNames.size() - 1)) {
|
||||
if (columnName.equals(timeFieldName)) {
|
||||
builder.append(quotePattern).append("?");
|
||||
String simpleTimePattern = timeFieldFormat.getSimplePattern().pattern();
|
||||
builder.append(simpleTimePattern.startsWith("\\b") ? simpleTimePattern.substring(2) : simpleTimePattern);
|
||||
explanation.add("Created a multi-line start pattern based on timestamp column [" + columnName + "]");
|
||||
return builder.toString();
|
||||
}
|
||||
Object columnMapping = mappings.get(columnName);
|
||||
if (columnMapping instanceof Map) {
|
||||
String type = (String) ((Map<?, ?>) columnMapping).get(FileStructureUtils.MAPPING_TYPE_SETTING);
|
||||
if (type != null) {
|
||||
String columnPattern;
|
||||
switch (type) {
|
||||
case "boolean":
|
||||
columnPattern = "(?:true|false)";
|
||||
break;
|
||||
case "byte":
|
||||
case "short":
|
||||
case "integer":
|
||||
case "long":
|
||||
columnPattern = "[+-]?\\d+";
|
||||
break;
|
||||
case "half_float":
|
||||
case "float":
|
||||
case "double":
|
||||
columnPattern = "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?";
|
||||
break;
|
||||
default:
|
||||
columnPattern = null;
|
||||
break;
|
||||
}
|
||||
if (columnPattern != null) {
|
||||
builder.append("(?:").append(columnPattern).append("|")
|
||||
.append(quotePattern).append(columnPattern).append(quotePattern).append(")")
|
||||
.append(delimiterPattern);
|
||||
explanation.add("Created a multi-line start pattern based on [" + type + "] column [" + columnName + "]");
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.append(".*?").append(delimiterPattern);
|
||||
}
|
||||
// TODO: if this happens a lot then we should try looking for the a multi-line END pattern instead of a start pattern.
|
||||
// But this would require changing the find_file_structure response, and the file upload UI, and would make creating Filebeat
|
||||
// configs from the find_file_structure response more complex, so let's wait to see if there's significant demand.
|
||||
explanation.add("Failed to create a suitable multi-line start pattern");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,10 +16,14 @@ import java.util.BitSet;
|
|||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows;
|
||||
import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinDistance;
|
||||
import static org.hamcrest.Matchers.arrayContaining;
|
||||
import static org.hamcrest.Matchers.contains;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.hasKey;
|
||||
import static org.hamcrest.Matchers.not;
|
||||
|
@ -50,7 +54,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
|
||||
}
|
||||
assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
|
||||
assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -85,7 +89,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
|
||||
}
|
||||
assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
|
||||
assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -134,10 +138,10 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
}
|
||||
|
||||
public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
|
||||
String sample = "message,time,count\n" +
|
||||
"\"hello\n" +
|
||||
"world\",2018-05-17T13:41:23,1\n" +
|
||||
"\"hello again\n"; // note that this last record is truncated
|
||||
String sample = "time,message,count\n" +
|
||||
"2018-05-17T13:41:23,\"hello\n" +
|
||||
"world\",1\n" +
|
||||
"2019-01-18T14:46:57,\"hello again\n"; // note that this last record is truncated
|
||||
assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
||||
|
||||
String charset = randomFrom(POSSIBLE_CHARSETS);
|
||||
|
@ -154,13 +158,13 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
} else {
|
||||
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
|
||||
}
|
||||
assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
|
||||
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertEquals("^\"?time\"?,\"?message\"?,\"?count\"?", structure.getExcludeLinesPattern());
|
||||
assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
assertNull(structure.getShouldTrimFields());
|
||||
assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames());
|
||||
assertEquals(Arrays.asList("time", "message", "count"), structure.getColumnNames());
|
||||
assertNull(structure.getGrokPattern());
|
||||
assertEquals("time", structure.getTimestampField());
|
||||
assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
|
||||
|
@ -193,7 +197,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
|
||||
"\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
|
||||
structure.getExcludeLinesPattern());
|
||||
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -238,7 +242,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
|
||||
"\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
|
||||
structure.getExcludeLinesPattern());
|
||||
assertEquals("^.*?,.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -278,7 +282,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
|
||||
"\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
|
||||
structure.getExcludeLinesPattern());
|
||||
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -325,7 +329,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
|
||||
"\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
|
||||
structure.getExcludeLinesPattern());
|
||||
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -435,7 +439,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
}
|
||||
// The exclude pattern needs to work on the raw text, so reflects the unmodified field names
|
||||
assertEquals("^\"?time\\.iso8601\"?,\"?message\"?", structure.getExcludeLinesPattern());
|
||||
assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf(','), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
|
@ -590,7 +594,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
public void testMakeCsvProcessorSettings() {
|
||||
|
||||
String field = randomAlphaOfLength(10);
|
||||
List<String> targetFields = Arrays.asList(generateRandomStringArray(10, field.length() - 1, false , false));
|
||||
List<String> targetFields = Arrays.asList(generateRandomStringArray(10, field.length() - 1, false, false));
|
||||
char separator = randomFrom(',', ';', '\t', '|');
|
||||
char quote = randomFrom('"', '\'');
|
||||
boolean trim = randomBoolean();
|
||||
|
@ -615,10 +619,99 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testMultilineStartPatternGivenNoMultiline() {
|
||||
|
||||
List<String> columnNames = Stream.generate(() -> randomAlphaOfLengthBetween(5, 10)).limit(10).collect(Collectors.toList());
|
||||
String timeFieldName;
|
||||
TimestampFormatFinder timeFieldFormat;
|
||||
if (randomBoolean()) {
|
||||
timeFieldName = columnNames.get(randomIntBetween(0, columnNames.size() - 1));
|
||||
timeFieldFormat = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER);
|
||||
timeFieldFormat.addSample("2020-01-30T15:05:09");
|
||||
} else {
|
||||
timeFieldName = null;
|
||||
timeFieldFormat = null;
|
||||
}
|
||||
Map<String, Object> mappings = new TreeMap<>();
|
||||
for (String columnName : columnNames) {
|
||||
if (columnName.equals(timeFieldName)) {
|
||||
mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
|
||||
} else {
|
||||
mappings.put(columnName,
|
||||
Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING,
|
||||
randomFrom("boolean", "long", "double", "text", "keyword")));
|
||||
}
|
||||
}
|
||||
|
||||
assertNull(DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 1, ",", "\"", mappings, timeFieldName,
|
||||
timeFieldFormat));
|
||||
assertThat(explanation, contains("Not creating a multi-line start pattern as no sampled message spanned multiple lines"));
|
||||
}
|
||||
|
||||
public void testMultilineStartPatternFromTimeField() {
|
||||
|
||||
List<String> columnNames = Stream.generate(() -> randomAlphaOfLengthBetween(5, 10)).limit(10).collect(Collectors.toList());
|
||||
int timeFieldColumnIndex = randomIntBetween(0, columnNames.size() - 2);
|
||||
String timeFieldName = columnNames.get(timeFieldColumnIndex);
|
||||
TimestampFormatFinder timeFieldFormat = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER);
|
||||
timeFieldFormat.addSample("2020-01-30T15:05:09");
|
||||
Map<String, Object> mappings = new TreeMap<>();
|
||||
for (String columnName : columnNames) {
|
||||
if (columnName.equals(timeFieldName)) {
|
||||
mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
|
||||
} else {
|
||||
mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, randomFrom("text", "keyword")));
|
||||
}
|
||||
}
|
||||
|
||||
String expected = "^" + Stream.generate(() -> ".*?,").limit(timeFieldColumnIndex).collect(Collectors.joining()) +
|
||||
"\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}";
|
||||
assertEquals(expected, DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 2, ",", "\"", mappings,
|
||||
timeFieldName, timeFieldFormat));
|
||||
assertThat(explanation, contains("Created a multi-line start pattern based on timestamp column [" + timeFieldName + "]"));
|
||||
}
|
||||
|
||||
public void testMultilineStartPatternFromMappings() {
|
||||
|
||||
int randomIndex = randomIntBetween(0, 2);
|
||||
String type = new String[]{ "boolean", "long", "double" }[randomIndex];
|
||||
String expectedTypePattern =
|
||||
new String[]{ "(?:true|false)", "[+-]?\\d+", "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?" }[randomIndex];
|
||||
List<String> columnNames = Stream.generate(() -> randomAlphaOfLengthBetween(5, 10)).limit(10).collect(Collectors.toList());
|
||||
int chosenFieldColumnIndex = randomIntBetween(0, columnNames.size() - 2);
|
||||
String chosenField = columnNames.get(chosenFieldColumnIndex);
|
||||
Map<String, Object> mappings = new TreeMap<>();
|
||||
for (String columnName : columnNames) {
|
||||
if (columnName.equals(chosenField)) {
|
||||
mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, type));
|
||||
} else {
|
||||
mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, randomFrom("text", "keyword")));
|
||||
}
|
||||
}
|
||||
|
||||
String expected = "^" + Stream.generate(() -> ".*?,").limit(chosenFieldColumnIndex).collect(Collectors.joining()) +
|
||||
"(?:" + expectedTypePattern + "|\"" + expectedTypePattern + "\"),";
|
||||
assertEquals(expected, DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 2, ",", "\"", mappings,
|
||||
null, null));
|
||||
assertThat(explanation, contains("Created a multi-line start pattern based on [" + type + "] column [" + chosenField + "]"));
|
||||
}
|
||||
|
||||
public void testMultilineStartPatternDeterminationTooHard() {
|
||||
|
||||
List<String> columnNames = Stream.generate(() -> randomAlphaOfLengthBetween(5, 10)).limit(10).collect(Collectors.toList());
|
||||
Map<String, Object> mappings = new TreeMap<>();
|
||||
for (String columnName : columnNames) {
|
||||
mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, randomFrom("text", "keyword")));
|
||||
}
|
||||
|
||||
assertNull(DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 2, ",", "\"", mappings, null, null));
|
||||
assertThat(explanation, contains("Failed to create a suitable multi-line start pattern"));
|
||||
}
|
||||
|
||||
static Map<String, Object> randomCsvProcessorSettings() {
|
||||
String field = randomAlphaOfLength(10);
|
||||
return DelimitedFileStructureFinder.makeCsvProcessorSettings(field,
|
||||
Arrays.asList(generateRandomStringArray(10, field.length() - 1, false , false)), randomFrom(',', ';', '\t', '|'),
|
||||
Arrays.asList(generateRandomStringArray(10, field.length() - 1, false, false)), randomFrom(',', ';', '\t', '|'),
|
||||
randomFrom('"', '\''), randomBoolean());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue