[ML] Fix detection of syslog-like timestamp in find_file_structure (#47970)

Usually syslog timestamps have two spaces before a single-digit
day-of-month. However, in some non-syslog cases where
syslog-like timestamps are used there is only one space. The
grok pattern already supports this, so the timestamp parser
should too. This change makes the find_file_structure endpoint
accept the single-space form as well.

Also fixes another problem in the find_file_structure endpoint
that the same test case exposed: the exclude_lines_pattern for
delimited files was always created on the assumption that the
delimiter was a comma. It is now based on the actual delimiter.
This commit is contained in:
David Roberts 2019-10-13 20:06:42 +01:00
parent 742fa818b8
commit 46ae86ac31
5 changed files with 57 additions and 12 deletions

View File

@ -139,9 +139,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
String quote = String.valueOf(csvPreference.getQuoteChar());
String twoQuotes = quote + quote;
String optQuote = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + "?";
String delimiterMatcher =
(delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
.map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote)
.collect(Collectors.joining(",")));
.collect(Collectors.joining(delimiterMatcher)));
}
boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing();

View File

@ -145,7 +145,7 @@ public final class TimestampFormatFinder {
example -> CandidateTimestampFormat.expandDayAndAdjustFractionalSecondsFromExample(example, "MMM dd HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
"%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", "SYSLOGTIMESTAMP",
Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 4, 10),
Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 6, 10),
new CandidateTimestampFormat(example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"),
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
"\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE",
@ -154,10 +154,10 @@ public final class TimestampFormatFinder {
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
"%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP",
Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), 0, 3),
new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"),
new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"),
"\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
"%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP",
Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 0),
Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 1, 0),
new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample,
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DATESTAMP}\\b", "DATESTAMP",
// In DATESTAMP the month may be 1 or 2 digits, but the day must be 2
@ -1467,7 +1467,7 @@ public final class TimestampFormatFinder {
/**
 * Builds the list of candidate timestamp formats for an example by first passing the
 * supplied format through {@code adjustFractionalSecondsFromEndOfExample} and then
 * returning the adjusted format together with variants in which {@code " dd"} is
 * replaced by a shorter day specifier.
 *
 * @param example the example text, forwarded unchanged to {@code adjustFractionalSecondsFromEndOfExample}
 * @param formatWithddAndNoFraction the base format; judging by the name it contains {@code dd} and no
 *                                  fractional seconds (not verifiable from this view)
 * @return the adjusted format followed by its day-of-month variants
 */
static List<String> expandDayAndAdjustFractionalSecondsFromExample(String example, String formatWithddAndNoFraction) {
// The helper is defined outside this view; per its name it adapts the format to any fractional seconds in the example.
String formatWithdd = adjustFractionalSecondsFromEndOfExample(example, formatWithddAndNoFraction);
// NOTE(review): two return statements appear because this view interleaves the removed and the added line of the
// diff hunk; presumably only the second (three-element) one exists in the current file - confirm.
return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d"));
// NOTE(review): the two replacements below read identically here; whitespace inside the literals may have been
// collapsed by the rendering (the commit message mentions both two-space and one-space forms) - confirm.
return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d"), formatWithdd.replace(" dd", " d"));
}
static List<String> indeterminateDayMonthFormatFromExample(String example) {

View File

@ -24,6 +24,7 @@ import static org.hamcrest.Matchers.equalTo;
public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false);
private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 3, false);
public void testCreateConfigsGivenCompleteCsv() throws Exception {
String sample = "time,message\n" +
@ -368,6 +369,47 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getJodaTimestampFormats());
}
/**
 * Checks structure detection for a tab-separated sample with a header row, a quoted
 * column containing an embedded comma, and a "Timestamp" column holding values such as
 * "Jun 30 2019 13:21:24" and "Jul 1 2019 06:02:43".
 * Asserts the detected format, delimiter, quote, header handling, column names,
 * timestamp field, Joda timestamp formats and the exclude-lines pattern.
 */
public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception {
// Header row followed by nine data rows; the "loc" values are quoted because they contain a comma.
String sample = "Latitude\tLongitude\tloc\tTimestamp\n" +
"25.78042\t18.441196\t\"25.7804200000,18.4411960000\"\tJun 30 2019 13:21:24\n" +
"25.743484\t18.443047\t\"25.7434840000,18.4430470000\"\tJun 30 2019 06:02:35\n" +
"25.744583\t18.442783\t\"25.7445830000,18.4427830000\"\tJun 30 2019 06:02:35\n" +
"25.754593\t18.431637\t\"25.7545930000,18.4316370000\"\tJul 1 2019 06:02:43\n" +
"25.768574\t18.433483\t\"25.7685740000,18.4334830000\"\tJul 1 2019 06:21:28\n" +
"25.757736\t18.438683\t\"25.7577360000,18.4386830000\"\tJul 1 2019 12:06:08\n" +
"25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" +
"25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" +
"25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n";
// The tab-delimited factory must accept the sample before a finder is created from it.
assertTrue(tsvFactory.canCreateFromSample(explanation, sample));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = tsvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
assertEquals(charset, structure.getCharset());
// The byte-order-marker value passed in (possibly null) must be reflected unchanged in the structure.
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
// The header columns in the exclude-lines pattern are joined with an escaped tab (\t), not a comma.
assertEquals("^\"?Latitude\"?\\t\"?Longitude\"?\\t\"?loc\"?\\t\"?Timestamp\"?",
structure.getExcludeLinesPattern());
assertNull(structure.getMultilineStartPattern());
assertEquals(Character.valueOf('\t'), structure.getDelimiter());
assertEquals(Character.valueOf('"'), structure.getQuote());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("Latitude", "Longitude", "loc", "Timestamp"), structure.getColumnNames());
assertNull(structure.getGrokPattern());
assertEquals("Timestamp", structure.getTimestampField());
// NOTE(review): the second and third expected formats read identically in this view; whitespace inside the
// literals may have been collapsed by the rendering - confirm against the actual file.
assertEquals(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
structure.getJodaTimestampFormats());
}
public void testCreateConfigsGivenDotInFieldName() throws Exception {
String sample = "time.iso8601,message\n" +
"2018-05-17T13:41:23,hello\n" +

View File

@ -194,7 +194,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
assertNotNull(match);
assertEquals("time", match.v1());
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName());
}
@ -227,7 +227,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
assertNotNull(match);
assertEquals("time2", match.v1());
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName());
}

View File

@ -661,9 +661,9 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
"\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "EEE MMM dd HH:mm:ss yyyy", 1526400896000L);
validateTimestampMatch("May 15 17:14:56.725", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L);
Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L);
validateTimestampMatch("May 15 17:14:56", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L);
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L);
validateTimestampMatch("15/May/2018:17:14:56 +0100", "HTTPDATE", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
"dd/MMM/yyyy:HH:mm:ss XX", 1526400896000L);
@ -672,7 +672,7 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a", 1526400896000L);
validateTimestampMatch("May 15 2018 17:14:56", "CISCOTIMESTAMP", "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L);
Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L);
validateTimestampMatch("05/15/2018 17:14:56,374", "DATESTAMP",
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss,SSS", 1526400896374L);
@ -799,7 +799,8 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
validateFindInFullMessage("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
"opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", "", "SYSLOGTIMESTAMP",
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"));
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"));
validateFindInFullMessage("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
"192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", "559550912540598297\t", "TIMESTAMP_ISO8601",
@ -807,7 +808,7 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
validateFindInFullMessage("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
"'www.elastic.co/A/IN': 95.110.68.206#53", "", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"));
Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"));
validateFindInFullMessage("10-28-2016 16:22:47.636 +0200 ERROR Network - " +
"Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", "", "DATESTAMP",