[ML] Restrict detection of epoch timestamps in find_file_structure (#43188)

Previously, any 10-digit number was considered a candidate
timestamp recorded as seconds since the epoch, and any 13-digit
number a candidate timestamp recorded as milliseconds since the epoch.

However, this meant that we could detect these formats for
numbers that would represent times far in the future.  For
example, ISBN numbers starting with 9 were detected as milliseconds
since the epoch because they have 13 digits.

This change tweaks the logic for detecting such timestamps to
require that they begin with 1 or 2.  This means that numbers
that would represent times from about 2065 onwards (3,000,000,000
seconds after the epoch falls in January 2065) are no longer
detected as epoch timestamps.  (We can add 3 to the definition
as we get closer to the cutoff date.)
This commit is contained in:
David Roberts 2019-06-13 13:14:33 +01:00
parent 8b3716553a
commit 43665183c2
2 changed files with 14 additions and 5 deletions

View File

@ -97,10 +97,10 @@ public final class TimestampFormatFinder {
"\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", "TIMESTAMP_ISO8601",
"1111 11 11 11 11", 0, 19);
static final CandidateTimestampFormat UNIX_MS_CANDIDATE_FORMAT =
new CandidateTimestampFormat(example -> Collections.singletonList("UNIX_MS"), "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT",
new CandidateTimestampFormat(example -> Collections.singletonList("UNIX_MS"), "\\b\\d{13}\\b", "\\b[12]\\d{12}\\b", "POSINT",
"1111111111111", 0, 0);
static final CandidateTimestampFormat UNIX_CANDIDATE_FORMAT =
new CandidateTimestampFormat(example -> Collections.singletonList("UNIX"), "\\b\\d{10}\\b", "\\b\\d{10}(?:\\.\\d{3,9})?\\b",
new CandidateTimestampFormat(example -> Collections.singletonList("UNIX"), "\\b\\d{10}\\b", "\\b[12]\\d{9}(?:\\.\\d{3,9})?\\b",
"NUMBER", "1111111111", 0, 10);
static final CandidateTimestampFormat TAI64N_CANDIDATE_FORMAT =
new CandidateTimestampFormat(example -> Collections.singletonList("TAI64N"), "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b",
@ -275,7 +275,7 @@ public final class TimestampFormatFinder {
"ss".equals(prevLetterGroup) == false || endPos - startPos > 9) {
String msg = "Letter group [" + letterGroup + "] in [" + overrideFormat + "] is not supported";
if (curChar == 'S') {
msg += " because it is not preceeded by [ss] and a separator from [" + FRACTIONAL_SECOND_SEPARATORS + "]";
msg += " because it is not preceded by [ss] and a separator from [" + FRACTIONAL_SECOND_SEPARATORS + "]";
}
throw new IllegalArgumentException(msg);
}

View File

@ -136,11 +136,11 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
e = expectThrows(IllegalArgumentException.class,
() -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss+SSSSSS"));
assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm:ss+SSSSSS] is not supported"
+ " because it is not preceeded by [ss] and a separator from [:.,]", e.getMessage());
+ " because it is not preceded by [ss] and a separator from [:.,]", e.getMessage());
e = expectThrows(IllegalArgumentException.class,
() -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm,SSSSSS"));
assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm,SSSSSS] is not supported"
+ " because it is not preceeded by [ss] and a separator from [:.,]", e.getMessage());
+ " because it is not preceded by [ss] and a separator from [:.,]", e.getMessage());
e = expectThrows(IllegalArgumentException.class,
() -> TimestampFormatFinder.overrideFormatToGrokAndRegex(" 'T' "));
assertEquals("No time format letter groups in override format [ 'T' ]", e.getMessage());
@ -562,6 +562,11 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
validateNoTimestampMatch("no timestamps in here");
validateNoTimestampMatch(":::");
validateNoTimestampMatch("/+");
// These two don't match because they're too far in the future
// when interpreted as seconds/milliseconds from the epoch
// (they need to be 10 or 13 digits beginning with 1 or 2)
validateNoTimestampMatch("3213213213");
validateNoTimestampMatch("9789522792167");
}
public void testFindFormatGivenOnlyIso8601() {
@ -693,10 +698,14 @@ public class TimestampFormatFinderTests extends FileStructureTestCase {
// Checks that strings consisting solely of a machine-style timestamp are
// detected as UNIX_MS, UNIX or TAI64N.
// NOTE(review): validateTimestampMatch is defined outside this view; its
// arguments appear to be (text, Grok pattern name, simple regex, format name,
// expected epoch milliseconds) - confirm against the helper.
public void testFindFormatGivenOnlySystemDate() {
// 13-digit values: the expected value equals the input, i.e. the text is
// read as milliseconds since the epoch.
validateTimestampMatch("1000000000000", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 1000000000000L);
validateTimestampMatch("1526400896374", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 1526400896374L);
// Largest 13-digit value that still begins with 1 or 2.
validateTimestampMatch("2999999999999", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 2999999999999L);
// 10-digit values, optionally with a fractional part: the expected value is
// the input multiplied by 1000, i.e. the text is read as seconds since the epoch.
validateTimestampMatch("1000000000", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1000000000000L);
validateTimestampMatch("1526400896.736", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1526400896736L);
validateTimestampMatch("1526400896", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1526400896000L);
// Largest 10-digit value beginning with 1 or 2, with millisecond fraction.
validateTimestampMatch("2999999999.999", "NUMBER", "\\b\\d{10}\\b", "UNIX", 2999999999999L);
// 24 hexadecimal digits are detected as a TAI64N label.
validateTimestampMatch("400000005afb078a164ac980", "BASE16NUM", "\\b[0-9A-Fa-f]{24}\\b", "TAI64N", 1526400896374L);
}