diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java index 04280261b26..a0e00ebf733 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java @@ -25,14 +25,15 @@ import java.util.regex.Pattern; */ public final class GrokPatternCreator { - private static String PREFACE = "preface"; - private static String EPILOGUE = "epilogue"; + private static final String PREFACE = "preface"; + private static final String EPILOGUE = "epilogue"; /** * The first match in this list will be chosen, so it needs to be ordered * such that more generic patterns come after more specific patterns. */ private static final List ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList( + new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"), new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"), new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"), new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"), @@ -41,7 +42,6 @@ public final class GrokPatternCreator { new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"), new GrokPatternCandidate("HTTPDATE", "timestamp"), new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"), - new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"), new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"), new GrokPatternCandidate("DATE", "date"), new GrokPatternCandidate("TIME", "time"), @@ -56,12 +56,10 @@ public final class GrokPatternCreator { new GrokPatternCandidate("IP", "ipaddress"), // This already includes pre/post break conditions new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""), - // Can't use \b as the break before, because it doesn't work for negative numbers (the - // minus sign is not a "word" character) - new GrokPatternCandidate("NUMBER", "field", "(? mustMatchStrings = Arrays.asList("2018-09-03 17:03:28,269 +0100 | ERROR | ", + "2018-09-03 17:04:27,279 +0100 | DEBUG | ", + "2018-09-03 17:05:26,289 +0100 | ERROR | "); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() { + + // If we're not careful then we might detect the first part of these strings as a + // number, e.g. 1.2 in the first example, but this is inappropriate given the + // trailing dot and digit + Collection mustMatchStrings = Arrays.asList("1.2.3", + "-2.3.4", + "4.5.6.7", + "-9.8.7.6.5"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".+?", overallGrokPatternBuilder.toString()); + } + public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { Collection mustMatchStrings = Arrays.asList("(-2)",