[ML] Fix detection of syslog-like timestamp in find_file_structure (#47970)
Usually syslog timestamps have two spaces before a single-digit day-of-month. However, in some non-syslog cases where syslog-like timestamps are used, there is only one space. The grok pattern supports this, so the timestamp parser should too. This change makes the find_file_structure endpoint accept the single-space form as well. It also fixes another problem in the find_file_structure endpoint that the same test case exposed: the exclude_lines_pattern for delimited files was always created on the assumption that the delimiter was a comma. Now it is based on the actual delimiter.
This commit is contained in:
parent
742fa818b8
commit
46ae86ac31
|
@ -139,9 +139,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
|||
String quote = String.valueOf(csvPreference.getQuoteChar());
|
||||
String twoQuotes = quote + quote;
|
||||
String optQuote = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + "?";
|
||||
String delimiterMatcher =
|
||||
(delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
|
||||
structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
|
||||
.map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote)
|
||||
.collect(Collectors.joining(",")));
|
||||
.collect(Collectors.joining(delimiterMatcher)));
|
||||
}
|
||||
|
||||
boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing();
|
||||
|
|
|
@ -145,7 +145,7 @@ public final class TimestampFormatFinder {
|
|||
example -> CandidateTimestampFormat.expandDayAndAdjustFractionalSecondsFromExample(example, "MMM dd HH:mm:ss"),
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
"%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", "SYSLOGTIMESTAMP",
|
||||
Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 4, 10),
|
||||
Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 6, 10),
|
||||
new CandidateTimestampFormat(example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"),
|
||||
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
|
||||
"\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE",
|
||||
|
@ -154,10 +154,10 @@ public final class TimestampFormatFinder {
|
|||
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
|
||||
"%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP",
|
||||
Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), 0, 3),
|
||||
new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"),
|
||||
new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"),
|
||||
"\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
"%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP",
|
||||
Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 0),
|
||||
Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 1, 0),
|
||||
new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample,
|
||||
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DATESTAMP}\\b", "DATESTAMP",
|
||||
// In DATESTAMP the month may be 1 or 2 digits, but the day must be 2
|
||||
|
@ -1467,7 +1467,7 @@ public final class TimestampFormatFinder {
|
|||
static List<String> expandDayAndAdjustFractionalSecondsFromExample(String example, String formatWithddAndNoFraction) {
|
||||
|
||||
String formatWithdd = adjustFractionalSecondsFromEndOfExample(example, formatWithddAndNoFraction);
|
||||
return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d"));
|
||||
return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d"), formatWithdd.replace(" dd", " d"));
|
||||
}
|
||||
|
||||
static List<String> indeterminateDayMonthFormatFromExample(String example) {
|
||||
|
|
|
@ -24,6 +24,7 @@ import static org.hamcrest.Matchers.equalTo;
|
|||
public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
||||
|
||||
private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false);
|
||||
private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 3, false);
|
||||
|
||||
public void testCreateConfigsGivenCompleteCsv() throws Exception {
|
||||
String sample = "time,message\n" +
|
||||
|
@ -368,6 +369,47 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|||
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getJodaTimestampFormats());
|
||||
}
|
||||
|
||||
public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception {
|
||||
String sample = "Latitude\tLongitude\tloc\tTimestamp\n" +
|
||||
"25.78042\t18.441196\t\"25.7804200000,18.4411960000\"\tJun 30 2019 13:21:24\n" +
|
||||
"25.743484\t18.443047\t\"25.7434840000,18.4430470000\"\tJun 30 2019 06:02:35\n" +
|
||||
"25.744583\t18.442783\t\"25.7445830000,18.4427830000\"\tJun 30 2019 06:02:35\n" +
|
||||
"25.754593\t18.431637\t\"25.7545930000,18.4316370000\"\tJul 1 2019 06:02:43\n" +
|
||||
"25.768574\t18.433483\t\"25.7685740000,18.4334830000\"\tJul 1 2019 06:21:28\n" +
|
||||
"25.757736\t18.438683\t\"25.7577360000,18.4386830000\"\tJul 1 2019 12:06:08\n" +
|
||||
"25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" +
|
||||
"25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" +
|
||||
"25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n";
|
||||
assertTrue(tsvFactory.canCreateFromSample(explanation, sample));
|
||||
|
||||
String charset = randomFrom(POSSIBLE_CHARSETS);
|
||||
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
||||
FileStructureFinder structureFinder = tsvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
|
||||
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
|
||||
|
||||
FileStructure structure = structureFinder.getStructure();
|
||||
|
||||
assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
|
||||
assertEquals(charset, structure.getCharset());
|
||||
if (hasByteOrderMarker == null) {
|
||||
assertNull(structure.getHasByteOrderMarker());
|
||||
} else {
|
||||
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
|
||||
}
|
||||
assertEquals("^\"?Latitude\"?\\t\"?Longitude\"?\\t\"?loc\"?\\t\"?Timestamp\"?",
|
||||
structure.getExcludeLinesPattern());
|
||||
assertNull(structure.getMultilineStartPattern());
|
||||
assertEquals(Character.valueOf('\t'), structure.getDelimiter());
|
||||
assertEquals(Character.valueOf('"'), structure.getQuote());
|
||||
assertTrue(structure.getHasHeaderRow());
|
||||
assertNull(structure.getShouldTrimFields());
|
||||
assertEquals(Arrays.asList("Latitude", "Longitude", "loc", "Timestamp"), structure.getColumnNames());
|
||||
assertNull(structure.getGrokPattern());
|
||||
assertEquals("Timestamp", structure.getTimestampField());
|
||||
assertEquals(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
|
||||
structure.getJodaTimestampFormats());
|
||||
}
|
||||
|
||||
public void testCreateConfigsGivenDotInFieldName() throws Exception {
|
||||
String sample = "time.iso8601,message\n" +
|
||||
"2018-05-17T13:41:23,hello\n" +
|
||||
|
|
|
@ -194,7 +194,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
|
||||
assertNotNull(match);
|
||||
assertEquals("time", match.v1());
|
||||
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
|
||||
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
|
||||
assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName());
|
||||
}
|
||||
|
||||
|
@ -227,7 +227,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
|
|||
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
|
||||
assertNotNull(match);
|
||||
assertEquals("time2", match.v1());
|
||||
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
|
||||
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
|
||||
assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName());
|
||||
}
|
||||
|
||||
|
|
|
@ -661,9 +661,9 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
|
|||
"\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "EEE MMM dd HH:mm:ss yyyy", 1526400896000L);
|
||||
|
||||
validateTimestampMatch("May 15 17:14:56.725", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L);
|
||||
Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L);
|
||||
validateTimestampMatch("May 15 17:14:56", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L);
|
||||
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L);
|
||||
|
||||
validateTimestampMatch("15/May/2018:17:14:56 +0100", "HTTPDATE", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
|
||||
"dd/MMM/yyyy:HH:mm:ss XX", 1526400896000L);
|
||||
|
@ -672,7 +672,7 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
|
|||
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a", 1526400896000L);
|
||||
|
||||
validateTimestampMatch("May 15 2018 17:14:56", "CISCOTIMESTAMP", "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L);
|
||||
Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L);
|
||||
|
||||
validateTimestampMatch("05/15/2018 17:14:56,374", "DATESTAMP",
|
||||
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss,SSS", 1526400896374L);
|
||||
|
@ -799,7 +799,8 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
|
|||
|
||||
validateFindInFullMessage("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
|
||||
"opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", "", "SYSLOGTIMESTAMP",
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"));
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"));
|
||||
|
||||
validateFindInFullMessage("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
|
||||
"192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", "559550912540598297\t", "TIMESTAMP_ISO8601",
|
||||
|
@ -807,7 +808,7 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
|
|||
|
||||
validateFindInFullMessage("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
|
||||
"'www.elastic.co/A/IN': 95.110.68.206#53", "", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
|
||||
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"));
|
||||
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"));
|
||||
|
||||
validateFindInFullMessage("10-28-2016 16:22:47.636 +0200 ERROR Network - " +
|
||||
"Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", "", "DATESTAMP",
|
||||
|
|
Loading…
Reference in New Issue