From 43665183c2d850f23237755449e9ca7c54965567 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 13 Jun 2019 13:14:33 +0100 Subject: [PATCH] [ML] Restrict detection of epoch timestamps in find_file_structure (#43188) Previously 10-digit numbers were considered candidates to be timestamps recorded as seconds since the epoch and 13-digit numbers as timestamps recorded as milliseconds since the epoch. However, this meant that we could detect these formats for numbers that would represent times far in the future. As an example, ISBN numbers starting with 9 were detected as milliseconds since the epoch because they had 13 digits. This change tweaks the logic for detecting such timestamps to require that they begin with 1 or 2. This means that numbers that would represent times beyond about 2065 are no longer detected as epoch timestamps. (We can add 3 to the definition as we get closer to the cutoff date.) --- .../filestructurefinder/TimestampFormatFinder.java | 6 +++--- .../TimestampFormatFinderTests.java | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index 0283437d648..9dec0ddbf5b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -97,10 +97,10 @@ public final class TimestampFormatFinder { "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", "TIMESTAMP_ISO8601", "1111 11 11 11 11", 0, 19); static final CandidateTimestampFormat UNIX_MS_CANDIDATE_FORMAT = - new CandidateTimestampFormat(example -> Collections.singletonList("UNIX_MS"), "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT", + new CandidateTimestampFormat(example -> 
Collections.singletonList("UNIX_MS"), "\\b\\d{13}\\b", "\\b[12]\\d{12}\\b", "POSINT", "1111111111111", 0, 0); static final CandidateTimestampFormat UNIX_CANDIDATE_FORMAT = - new CandidateTimestampFormat(example -> Collections.singletonList("UNIX"), "\\b\\d{10}\\b", "\\b\\d{10}(?:\\.\\d{3,9})?\\b", + new CandidateTimestampFormat(example -> Collections.singletonList("UNIX"), "\\b\\d{10}\\b", "\\b[12]\\d{9}(?:\\.\\d{3,9})?\\b", "NUMBER", "1111111111", 0, 10); static final CandidateTimestampFormat TAI64N_CANDIDATE_FORMAT = new CandidateTimestampFormat(example -> Collections.singletonList("TAI64N"), "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", @@ -275,7 +275,7 @@ public final class TimestampFormatFinder { "ss".equals(prevLetterGroup) == false || endPos - startPos > 9) { String msg = "Letter group [" + letterGroup + "] in [" + overrideFormat + "] is not supported"; if (curChar == 'S') { - msg += " because it is not preceeded by [ss] and a separator from [" + FRACTIONAL_SECOND_SEPARATORS + "]"; + msg += " because it is not preceded by [ss] and a separator from [" + FRACTIONAL_SECOND_SEPARATORS + "]"; } throw new IllegalArgumentException(msg); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java index b80e8a5712a..0b872e630b6 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java @@ -136,11 +136,11 @@ public class TimestampFormatFinderTests extends FileStructureTestCase { e = expectThrows(IllegalArgumentException.class, () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss+SSSSSS")); assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm:ss+SSSSSS] is not supported" - + " because it 
is not preceeded by [ss] and a separator from [:.,]", e.getMessage()); + + " because it is not preceded by [ss] and a separator from [:.,]", e.getMessage()); e = expectThrows(IllegalArgumentException.class, () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm,SSSSSS")); assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm,SSSSSS] is not supported" - + " because it is not preceeded by [ss] and a separator from [:.,]", e.getMessage()); + + " because it is not preceded by [ss] and a separator from [:.,]", e.getMessage()); e = expectThrows(IllegalArgumentException.class, () -> TimestampFormatFinder.overrideFormatToGrokAndRegex(" 'T' ")); assertEquals("No time format letter groups in override format [ 'T' ]", e.getMessage()); @@ -562,6 +562,11 @@ public class TimestampFormatFinderTests extends FileStructureTestCase { validateNoTimestampMatch("no timestamps in here"); validateNoTimestampMatch(":::"); validateNoTimestampMatch("/+"); + // These two don't match because they're too far in the future + // when interpreted as seconds/milliseconds from the epoch + // (they need to be 10 or 13 digits beginning with 1 or 2) + validateNoTimestampMatch("3213213213"); + validateNoTimestampMatch("9789522792167"); } public void testFindFormatGivenOnlyIso8601() { @@ -693,10 +698,14 @@ public class TimestampFormatFinderTests extends FileStructureTestCase { public void testFindFormatGivenOnlySystemDate() { + validateTimestampMatch("1000000000000", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 1000000000000L); validateTimestampMatch("1526400896374", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 1526400896374L); + validateTimestampMatch("2999999999999", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 2999999999999L); + validateTimestampMatch("1000000000", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1000000000000L); validateTimestampMatch("1526400896.736", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1526400896736L); validateTimestampMatch("1526400896", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1526400896000L); + 
validateTimestampMatch("2999999999.999", "NUMBER", "\\b\\d{10}\\b", "UNIX", 2999999999999L); validateTimestampMatch("400000005afb078a164ac980", "BASE16NUM", "\\b[0-9A-Fa-f]{24}\\b", "TAI64N", 1526400896374L); }