[ML] Add log structure finder functionality (#32788)

This change adds a library to ML that can be used to deduce a log
file's structure given only a sample of the log file.

Eventually this will be used to add an endpoint to ML to make the
functionality available to end users, but this will follow in a
separate change.

The functionality is split into a library so that it can also be
used by a command line tool without requiring that tool to pull in
all of the server code.
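For context, a minimal sketch of how the library is meant to be driven,
using the factory classes added below. The example class, sample text and
charset are hypothetical; only the canCreateFromSample, createFromSample
and getStructure calls come from the interfaces in this change.

import java.util.ArrayList;
import java.util.List;
import org.elasticsearch.xpack.ml.logstructurefinder.CsvLogStructureFinderFactory;
import org.elasticsearch.xpack.ml.logstructurefinder.LogStructure;
import org.elasticsearch.xpack.ml.logstructurefinder.LogStructureFinder;
import org.elasticsearch.xpack.ml.logstructurefinder.LogStructureFinderFactory;

public class LogStructureFinderExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical sample: the first few lines read from a log file.
        String sample = "time,id,message\n"
            + "2018-08-15T17:00:00Z,1,started\n"
            + "2018-08-15T17:00:01Z,2,stopped\n";
        List<String> explanation = new ArrayList<>();
        LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
        // First check whether the format is plausible for the sample...
        if (factory.canCreateFromSample(explanation, sample)) {
            // ...then deduce the full structure (the charset is assumed to
            // have been detected before the factories are tried).
            LogStructureFinder finder =
                factory.createFromSample(explanation, sample, "UTF-8", false);
            LogStructure structure = finder.getStructure();
            System.out.println(structure.getFormat()); // csv
        }
        // The explanation records why each decision was made.
        explanation.forEach(System.out::println);
    }
}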
David Roberts authored 2018-08-15 18:04:21 +01:00, committed by GitHub
parent 986c55b830
commit 5ba04e23fc
42 changed files with 5744 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
import org.elasticsearch.gradle.precommit.PrecommitTasks
apply plugin: 'elasticsearch.build'
archivesBaseName = 'x-pack-log-structure-finder'
description = 'Common code for reverse engineering log structure'
dependencies {
compile "org.elasticsearch:elasticsearch-core:${version}"
compile "org.elasticsearch:elasticsearch-x-content:${version}"
compile project(':libs:grok')
compile "com.ibm.icu:icu4j:${versions.icu4j}"
compile "net.sf.supercsv:super-csv:${versions.supercsv}"
testCompile "org.elasticsearch.test:framework:${version}"
}
configurations {
testArtifacts.extendsFrom testRuntime
}
task testJar(type: Jar) {
appendix 'test'
from sourceSets.test.output
}
artifacts {
// normal ES plugins do not publish their jar, but we need to because users need it for Transport Clients and extensions
archives jar
testArtifacts testJar
}
forbiddenApisMain {
// log-structure-finder does not depend on server, so it cannot forbid server methods
signaturesURLs = [PrecommitTasks.getResource('/forbidden/jdk-signatures.txt')]
}

View File

@@ -0,0 +1 @@
7a4d00d5ec5febd252a6182e8b6e87a0a9821f81

View File

@@ -0,0 +1,33 @@
ICU License - ICU 1.8.1 and later
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2012 International Business Machines Corporation and others
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do so,
provided that the above copyright notice(s) and this permission notice appear
in all copies of the Software and that both the above copyright notice(s) and
this permission notice appear in supporting documentation.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall not
be used in advertising or otherwise to promote the sale, use or other
dealings in this Software without prior written authorization of the
copyright holder.
All trademarks and registered trademarks mentioned herein are the property of
their respective owners.

View File

@@ -0,0 +1,3 @@
ICU4J, (under lucene/analysis/icu) is licensed under an MIT style license
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
International Business Machines Corporation and others

View File

@@ -0,0 +1 @@
017f8708c929029dde48bc298deaf3c7ae2452d3

View File

@@ -0,0 +1,203 @@
/*
* Apache License
* Version 2.0, January 2004
* http://www.apache.org/licenses/
*
* TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
*
* 1. Definitions.
*
* "License" shall mean the terms and conditions for use, reproduction,
* and distribution as defined by Sections 1 through 9 of this document.
*
* "Licensor" shall mean the copyright owner or entity authorized by
* the copyright owner that is granting the License.
*
* "Legal Entity" shall mean the union of the acting entity and all
* other entities that control, are controlled by, or are under common
* control with that entity. For the purposes of this definition,
* "control" means (i) the power, direct or indirect, to cause the
* direction or management of such entity, whether by contract or
* otherwise, or (ii) ownership of fifty percent (50%) or more of the
* outstanding shares, or (iii) beneficial ownership of such entity.
*
* "You" (or "Your") shall mean an individual or Legal Entity
* exercising permissions granted by this License.
*
* "Source" form shall mean the preferred form for making modifications,
* including but not limited to software source code, documentation
* source, and configuration files.
*
* "Object" form shall mean any form resulting from mechanical
* transformation or translation of a Source form, including but
* not limited to compiled object code, generated documentation,
* and conversions to other media types.
*
* "Work" shall mean the work of authorship, whether in Source or
* Object form, made available under the License, as indicated by a
* copyright notice that is included in or attached to the work
* (an example is provided in the Appendix below).
*
* "Derivative Works" shall mean any work, whether in Source or Object
* form, that is based on (or derived from) the Work and for which the
* editorial revisions, annotations, elaborations, or other modifications
* represent, as a whole, an original work of authorship. For the purposes
* of this License, Derivative Works shall not include works that remain
* separable from, or merely link (or bind by name) to the interfaces of,
* the Work and Derivative Works thereof.
*
* "Contribution" shall mean any work of authorship, including
* the original version of the Work and any modifications or additions
* to that Work or Derivative Works thereof, that is intentionally
* submitted to Licensor for inclusion in the Work by the copyright owner
* or by an individual or Legal Entity authorized to submit on behalf of
* the copyright owner. For the purposes of this definition, "submitted"
* means any form of electronic, verbal, or written communication sent
* to the Licensor or its representatives, including but not limited to
* communication on electronic mailing lists, source code control systems,
* and issue tracking systems that are managed by, or on behalf of, the
* Licensor for the purpose of discussing and improving the Work, but
* excluding communication that is conspicuously marked or otherwise
* designated in writing by the copyright owner as "Not a Contribution."
*
* "Contributor" shall mean Licensor and any individual or Legal Entity
* on behalf of whom a Contribution has been received by Licensor and
* subsequently incorporated within the Work.
*
* 2. Grant of Copyright License. Subject to the terms and conditions of
* this License, each Contributor hereby grants to You a perpetual,
* worldwide, non-exclusive, no-charge, royalty-free, irrevocable
* copyright license to reproduce, prepare Derivative Works of,
* publicly display, publicly perform, sublicense, and distribute the
* Work and such Derivative Works in Source or Object form.
*
* 3. Grant of Patent License. Subject to the terms and conditions of
* this License, each Contributor hereby grants to You a perpetual,
* worldwide, non-exclusive, no-charge, royalty-free, irrevocable
* (except as stated in this section) patent license to make, have made,
* use, offer to sell, sell, import, and otherwise transfer the Work,
* where such license applies only to those patent claims licensable
* by such Contributor that are necessarily infringed by their
* Contribution(s) alone or by combination of their Contribution(s)
* with the Work to which such Contribution(s) was submitted. If You
* institute patent litigation against any entity (including a
* cross-claim or counterclaim in a lawsuit) alleging that the Work
* or a Contribution incorporated within the Work constitutes direct
* or contributory patent infringement, then any patent licenses
* granted to You under this License for that Work shall terminate
* as of the date such litigation is filed.
*
* 4. Redistribution. You may reproduce and distribute copies of the
* Work or Derivative Works thereof in any medium, with or without
* modifications, and in Source or Object form, provided that You
* meet the following conditions:
*
* (a) You must give any other recipients of the Work or
* Derivative Works a copy of this License; and
*
* (b) You must cause any modified files to carry prominent notices
* stating that You changed the files; and
*
* (c) You must retain, in the Source form of any Derivative Works
* that You distribute, all copyright, patent, trademark, and
* attribution notices from the Source form of the Work,
* excluding those notices that do not pertain to any part of
* the Derivative Works; and
*
* (d) If the Work includes a "NOTICE" text file as part of its
* distribution, then any Derivative Works that You distribute must
* include a readable copy of the attribution notices contained
* within such NOTICE file, excluding those notices that do not
* pertain to any part of the Derivative Works, in at least one
* of the following places: within a NOTICE text file distributed
* as part of the Derivative Works; within the Source form or
* documentation, if provided along with the Derivative Works; or,
* within a display generated by the Derivative Works, if and
* wherever such third-party notices normally appear. The contents
* of the NOTICE file are for informational purposes only and
* do not modify the License. You may add Your own attribution
* notices within Derivative Works that You distribute, alongside
* or as an addendum to the NOTICE text from the Work, provided
* that such additional attribution notices cannot be construed
* as modifying the License.
*
* You may add Your own copyright statement to Your modifications and
* may provide additional or different license terms and conditions
* for use, reproduction, or distribution of Your modifications, or
* for any such Derivative Works as a whole, provided Your use,
* reproduction, and distribution of the Work otherwise complies with
* the conditions stated in this License.
*
* 5. Submission of Contributions. Unless You explicitly state otherwise,
* any Contribution intentionally submitted for inclusion in the Work
* by You to the Licensor shall be under the terms and conditions of
* this License, without any additional terms or conditions.
* Notwithstanding the above, nothing herein shall supersede or modify
* the terms of any separate license agreement you may have executed
* with Licensor regarding such Contributions.
*
* 6. Trademarks. This License does not grant permission to use the trade
* names, trademarks, service marks, or product names of the Licensor,
* except as required for reasonable and customary use in describing the
* origin of the Work and reproducing the content of the NOTICE file.
*
* 7. Disclaimer of Warranty. Unless required by applicable law or
* agreed to in writing, Licensor provides the Work (and each
* Contributor provides its Contributions) on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied, including, without limitation, any warranties or conditions
* of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
* PARTICULAR PURPOSE. You are solely responsible for determining the
* appropriateness of using or redistributing the Work and assume any
* risks associated with Your exercise of permissions under this License.
*
* 8. Limitation of Liability. In no event and under no legal theory,
* whether in tort (including negligence), contract, or otherwise,
* unless required by applicable law (such as deliberate and grossly
* negligent acts) or agreed to in writing, shall any Contributor be
* liable to You for damages, including any direct, indirect, special,
* incidental, or consequential damages of any character arising as a
* result of this License or out of the use or inability to use the
* Work (including but not limited to damages for loss of goodwill,
* work stoppage, computer failure or malfunction, or any and all
* other commercial damages or losses), even if such Contributor
* has been advised of the possibility of such damages.
*
* 9. Accepting Warranty or Additional Liability. While redistributing
* the Work or Derivative Works thereof, You may choose to offer,
* and charge a fee for, acceptance of support, warranty, indemnity,
* or other liability obligations and/or rights consistent with this
* License. However, in accepting such obligations, You may act only
* on Your own behalf and on Your sole responsibility, not on behalf
* of any other Contributor, and only if You agree to indemnify,
* defend, and hold each Contributor harmless for any liability
* incurred by, or claims asserted against, such Contributor by reason
* of your accepting any such warranty or additional liability.
*
* END OF TERMS AND CONDITIONS
*
* APPENDIX: How to apply the Apache License to your work.
*
* To apply the Apache License to your work, attach the following
* boilerplate notice, with the fields enclosed by brackets "[]"
* replaced with your own identifying information. (Don't include
* the brackets!) The text should be enclosed in the appropriate
* comment syntax for the file format. We also recommend that a
* file or class name and description of purpose be included on the
* same "printed page" as the copyright notice for easier
* identification within third-party archives.
*
* Copyright 2007 Kasper B. Graversen
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

View File

@@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.supercsv.prefs.CsvPreference;
import java.io.IOException;
import java.util.List;
public class CsvLogStructureFinderFactory implements LogStructureFinderFactory {
/**
* Rules are:
* - The file must be valid CSV
* - It must contain at least two complete records
* - There must be at least two fields per record (otherwise files with no commas could be treated as CSV!)
* - Every CSV record except the last must have the same number of fields
* The reason the last record is allowed to have fewer fields than the others is that
* it could have been truncated when the file was sampled.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.EXCEL_PREFERENCE, "CSV");
}
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
CsvPreference.EXCEL_PREFERENCE, false);
}
}
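To make the rules above concrete, a hedged sketch; the sample strings are
invented and the expected results are inferred from the documented rules,
not from running the code:

import java.util.ArrayList;
import java.util.List;
import org.elasticsearch.xpack.ml.logstructurefinder.CsvLogStructureFinderFactory;

class CsvRulesExample {
    public static void main(String[] args) {
        List<String> explanation = new ArrayList<>();
        CsvLogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
        // Two complete records with two fields each: should match.
        System.out.println(factory.canCreateFromSample(explanation, "a,b\nc,d\n"));
        // One field per record: should be rejected, otherwise any file
        // without commas could be treated as CSV.
        System.out.println(factory.canCreateFromSample(explanation, "a\nb\nc\n"));
        // A shorter final record should be tolerated, because sampling may
        // have truncated the file mid-record.
        System.out.println(factory.canCreateFromSample(explanation, "a,b,c\nd,e,f\ng,h"));
    }
}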

View File

@@ -0,0 +1,615 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.grok.Grok;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Creates Grok patterns that will match all provided sample messages.
*
* The choice of field names is quite primitive. The intention is that a human will edit these.
*/
public final class GrokPatternCreator {
private static final Map<Character, Boolean> PUNCTUATION_OR_SPACE_NEEDS_ESCAPING;
static {
HashMap<Character, Boolean> punctuationOrSpaceNeedsEscaping = new HashMap<>();
String punctuationAndSpaceCharacters = "\"'`‘’“”#@%=\\/|~:;,<>()[]{}«»^$*¿?¡!§¶ \t\n";
String punctuationThatNeedsEscaping = "\\|()[]{}^$*?";
punctuationAndSpaceCharacters.chars()
.forEach(c -> punctuationOrSpaceNeedsEscaping.put((char) c, punctuationThatNeedsEscaping.indexOf(c) >= 0));
PUNCTUATION_OR_SPACE_NEEDS_ESCAPING = Collections.unmodifiableMap(punctuationOrSpaceNeedsEscaping);
}
private static final String PREFACE = "preface";
private static final String VALUE = "value";
private static final String EPILOGUE = "epilogue";
/**
* Grok patterns that are designed to match the whole message, not just a part of it.
*/
private static final List<FullMatchGrokPatternCandidate> FULL_MATCH_GROK_PATTERNS = Arrays.asList(
new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"),
new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"),
new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"),
new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"),
new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"),
new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"),
new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"),
new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"),
new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"),
new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"),
new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"),
new FullMatchGrokPatternCandidate("RAILS3", "timestamp"),
new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"),
new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"),
new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp")
);
/**
* The first match in this list will be chosen, so it needs to be ordered
* such that more generic patterns come after more specific patterns.
*/
private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
new ValueOnlyGrokPatternCandidate("TOMCAT_DATESTAMP", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC822", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC2822", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("DATESTAMP_OTHER", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("DATESTAMP_EVENTLOG", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("SYSLOGTIMESTAMP", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("HTTPDATE", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("CATALINA_DATESTAMP", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("CISCOTIMESTAMP", "date", "extra_timestamp"),
new ValueOnlyGrokPatternCandidate("LOGLEVEL", "keyword", "loglevel"),
new ValueOnlyGrokPatternCandidate("URI", "keyword", "uri"),
new ValueOnlyGrokPatternCandidate("UUID", "keyword", "uuid"),
new ValueOnlyGrokPatternCandidate("MAC", "keyword", "macaddress"),
// Can't use \b as the breaks, because slashes are not "word" characters
new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(?<!\\w)", "(?!\\w)"),
new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"),
// TODO: would be nice to have IPORHOST here, but HOST matches almost all words
new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"),
new ValueOnlyGrokPatternCandidate("DATE", "date", "date"),
new ValueOnlyGrokPatternCandidate("TIME", "date", "time"),
// This already includes pre/post break conditions
new ValueOnlyGrokPatternCandidate("QUOTEDSTRING", "keyword", "field", "", ""),
// Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
// up numeric suffixes too eagerly
new ValueOnlyGrokPatternCandidate("INT", "long", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
new ValueOnlyGrokPatternCandidate("NUMBER", "double", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
new ValueOnlyGrokPatternCandidate("BASE16NUM", "keyword", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
// TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
// Fixing these problems with overly broad matches would require some extra intelligence
// to be added to remove inappropriate matches. One idea would be to use a dictionary,
// but that doesn't necessarily help as "jay" could be a username but is also a dictionary
// word (plus there's the international headache with relying on dictionaries). Similarly,
// hostnames could also be dictionary words - I've worked on machines called "hippo" and
// "scarf" in the past. Another idea would be to look at the adjacent characters and
// apply some heuristic based on those.
);
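// As an illustration of the ordering constraint: TOMCAT_DATESTAMP is listed before
// TIMESTAMP_ISO8601 because a Tomcat datestamp starts with an ISO8601-style date, so
// the more generic pattern would otherwise claim the match first.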
/**
* It is expected that the explanation will be shared with other code.
* Both this class and other classes will update it.
*/
private final List<String> explanation;
private final Collection<String> sampleMessages;
/**
* It is expected that the mappings will be shared with other code.
* Both this class and other classes will update it.
*/
private final Map<String, Object> mappings;
private final Map<String, Integer> fieldNameCountStore = new HashMap<>();
private final StringBuilder overallGrokPatternBuilder = new StringBuilder();
/**
*
* @param explanation List of reasons for making decisions. May already contain items when passed in, and new reasons
* can be appended by the methods of this class.
* @param sampleMessages Sample messages that any Grok pattern found must match.
* @param mappings Will be updated with mappings appropriate for the returned pattern, if non-<code>null</code>.
*/
public GrokPatternCreator(List<String> explanation, Collection<String> sampleMessages, Map<String, Object> mappings) {
this.explanation = explanation;
this.sampleMessages = Collections.unmodifiableCollection(sampleMessages);
this.mappings = mappings;
}
/**
* This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
* @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
*/
public Tuple<String, String> findFullLineGrokPattern() {
for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
if (candidate.matchesAll(sampleMessages)) {
return candidate.processMatch(explanation, sampleMessages, mappings);
}
}
return null;
}
/**
* Build a Grok pattern that will match all of the sample messages in their entirety.
* @param seedPatternName A pattern that has already been determined to match some portion of every sample message.
* @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches.
* @return The built Grok pattern.
*/
public String createGrokPatternFromExamples(String seedPatternName, String seedFieldName) {
overallGrokPatternBuilder.setLength(0);
GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName);
processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0);
return overallGrokPatternBuilder.toString().replace("\t", "\\t").replace("\n", "\\n");
}
/**
* This exists purely so that unit tests can inspect the partial Grok pattern when testing implementation details.
* It should not be used in production code.
*/
StringBuilder getOverallGrokPatternBuilder() {
return overallGrokPatternBuilder;
}
/**
* Given a chosen Grok pattern and a collection of message snippets, split the snippets into the
* matched section and the pieces before and after it. Recurse to find more matches in the pieces
* before and after and update the supplied string builder.
*/
private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolean isLast, Collection<String> snippets,
boolean ignoreKeyValueCandidateLeft, int ignoreValueOnlyCandidatesLeft,
boolean ignoreKeyValueCandidateRight, int ignoreValueOnlyCandidatesRight) {
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
String patternBuilderContent = chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings);
appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft);
overallGrokPatternBuilder.append(patternBuilderContent);
appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight);
}
/**
* Given a collection of message snippets, work out which (if any) of the Grok patterns we're allowed
* to use matches them best. Then append the appropriate Grok language to represent that finding onto
* the supplied string builder.
*/
void appendBestGrokMatchForStrings(boolean isLast, Collection<String> snippets,
boolean ignoreKeyValueCandidate, int ignoreValueOnlyCandidates) {
snippets = adjustForPunctuation(snippets);
GrokPatternCandidate bestCandidate = null;
if (snippets.isEmpty() == false) {
GrokPatternCandidate kvCandidate = new KeyValueGrokPatternCandidate(explanation);
if (ignoreKeyValueCandidate == false && kvCandidate.matchesAll(snippets)) {
bestCandidate = kvCandidate;
} else {
ignoreKeyValueCandidate = true;
for (GrokPatternCandidate candidate :
ORDERED_CANDIDATE_GROK_PATTERNS.subList(ignoreValueOnlyCandidates, ORDERED_CANDIDATE_GROK_PATTERNS.size())) {
if (candidate.matchesAll(snippets)) {
bestCandidate = candidate;
break;
}
++ignoreValueOnlyCandidates;
}
}
}
if (bestCandidate == null) {
if (isLast) {
finalizeGrokPattern(snippets);
} else {
addIntermediateRegex(snippets);
}
} else {
processCandidateAndSplit(bestCandidate, isLast, snippets, true, ignoreValueOnlyCandidates + (ignoreKeyValueCandidate ? 1 : 0),
ignoreKeyValueCandidate, ignoreValueOnlyCandidates);
}
}
/**
* If the snippets supplied begin with more than 1 character of common punctuation or whitespace
* then add all but the last of these characters to the overall pattern and remove them from the
* snippets.
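* For example, given snippets ["], foo", "], bar"] the common initial punctuation is "], ",
* so "\]," is appended to the overall pattern and the returned snippets are [" foo", " bar"].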
* @param snippets Input snippets - not modified.
* @return Output snippets, which will be a copy of the input snippets but with whatever characters
* were added to <code>overallGrokPatternBuilder</code> removed from the beginning.
*/
Collection<String> adjustForPunctuation(Collection<String> snippets) {
assert snippets.isEmpty() == false;
StringBuilder commonInitialPunctuation = new StringBuilder();
for (String snippet : snippets) {
if (commonInitialPunctuation.length() == 0) {
for (int index = 0; index < snippet.length(); ++index) {
char ch = snippet.charAt(index);
if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch) != null) {
commonInitialPunctuation.append(ch);
} else {
break;
}
}
} else {
if (commonInitialPunctuation.length() > snippet.length()) {
commonInitialPunctuation.delete(snippet.length(), commonInitialPunctuation.length());
}
for (int index = 0; index < commonInitialPunctuation.length(); ++index) {
char ch = snippet.charAt(index);
if (ch != commonInitialPunctuation.charAt(index)) {
commonInitialPunctuation.delete(index, commonInitialPunctuation.length());
break;
}
}
}
if (commonInitialPunctuation.length() <= 1) {
return snippets;
}
}
int numLiteralCharacters = commonInitialPunctuation.length() - 1;
for (int index = 0; index < numLiteralCharacters; ++index) {
char ch = commonInitialPunctuation.charAt(index);
if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.getOrDefault(ch, false)) {
overallGrokPatternBuilder.append('\\');
}
overallGrokPatternBuilder.append(ch);
}
return snippets.stream().map(snippet -> snippet.substring(numLiteralCharacters)).collect(Collectors.toList());
}
/**
* The first time a particular field name is passed, simply return it.
* The second time return it with "2" appended.
* The third time return it with "3" appended.
* Etc.
*/
static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
return (numberSeen > 1) ? fieldName + numberSeen : fieldName;
}
private void addIntermediateRegex(Collection<String> snippets) {
addIntermediateRegex(overallGrokPatternBuilder, snippets);
}
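// For example, given snippets [" foo [bar]", " baz [qux]"] the method below appends
// " .*? \[.*?\]": the punctuation and spaces common to every snippet, with lazy
// wildcards covering the varying text in between.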
public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
if (snippets.isEmpty()) {
return;
}
List<String> others = new ArrayList<>(snippets);
String driver = others.remove(others.size() - 1);
boolean wildcardRequiredIfNonMatchFound = true;
for (int i = 0; i < driver.length(); ++i) {
char ch = driver.charAt(i);
Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
patternBuilder.append(".*?");
}
if (punctuationOrSpaceNeedsEscaping) {
patternBuilder.append('\\');
}
patternBuilder.append(ch);
wildcardRequiredIfNonMatchFound = true;
others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
} else if (wildcardRequiredIfNonMatchFound) {
patternBuilder.append(".*?");
wildcardRequiredIfNonMatchFound = false;
}
}
if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
patternBuilder.append(".*?");
}
}
private void finalizeGrokPattern(Collection<String> snippets) {
if (snippets.stream().allMatch(String::isEmpty)) {
return;
}
List<String> others = new ArrayList<>(snippets);
String driver = others.remove(others.size() - 1);
for (int i = 0; i < driver.length(); ++i) {
char ch = driver.charAt(i);
int driverIndex = i;
Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
if (punctuationOrSpaceNeedsEscaping != null &&
others.stream().allMatch(other -> other.length() > driverIndex && other.charAt(driverIndex) == ch)) {
if (punctuationOrSpaceNeedsEscaping) {
overallGrokPatternBuilder.append('\\');
}
overallGrokPatternBuilder.append(ch);
if (i == driver.length() - 1 && others.stream().allMatch(driver::equals)) {
return;
}
} else {
break;
}
}
overallGrokPatternBuilder.append(".*");
}
interface GrokPatternCandidate {
/**
* @return Does this Grok pattern candidate match all the snippets?
*/
boolean matchesAll(Collection<String> snippets);
/**
* After it has been determined that this Grok pattern candidate matches a collection of strings,
* return collections of the bits that come before (prefaces) and after (epilogues) the bit
* that matches. Also update mappings with the most appropriate field name and type.
* @return The string that needs to be incorporated into the overall Grok pattern for the line.
*/
String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
Collection<String> epilogues, Map<String, Object> mappings);
}
/**
* A Grok pattern candidate that will match a single named Grok pattern.
*/
static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate {
private final String grokPatternName;
private final String mappingType;
private final String fieldName;
private final Grok grok;
/**
* Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
* end with a non-"word" character (where a word character is a letter, digit or underscore). For such patterns use one
* of the other constructors.
* <p>
* In cases where the Grok pattern defined by Logstash already includes conditions on what must
* come before and after the match, use one of the other constructors and specify an empty string
* for the pre and/or post breaks.
*
* @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
* @param mappingType Data type for field in Elasticsearch mappings.
* @param fieldName Name of the field to extract from the match.
*/
ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) {
this(grokPatternName, mappingType, fieldName, "\\b", "\\b");
}
/**
* @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
* @param mappingType Data type for field in Elasticsearch mappings.
* @param fieldName Name of the field to extract from the match.
* @param preBreak Only consider the match if it's broken from the previous text by this.
* @param postBreak Only consider the match if it's broken from the following text by this.
*/
ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) {
this.grokPatternName = grokPatternName;
this.mappingType = mappingType;
this.fieldName = fieldName;
// The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + preBreak +
"%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
}
@Override
public boolean matchesAll(Collection<String> snippets) {
return snippets.stream().allMatch(grok::match);
}
/**
* Given a collection of strings, and a Grok pattern that matches some part of them all,
* return collections of the bits that come before (prefaces) and after (epilogues) the
* bit that matches.
*/
@Override
public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
Collection<String> epilogues, Map<String, Object> mappings) {
String sampleValue = null;
for (String snippet : snippets) {
Map<String, Object> captures = grok.captures(snippet);
// If the pattern doesn't match then captures will be null
if (captures == null) {
throw new IllegalStateException("[%{" + grokPatternName + "}] does not match snippet [" + snippet + "]");
}
prefaces.add(captures.getOrDefault(PREFACE, "").toString());
if (sampleValue == null) {
sampleValue = captures.get(VALUE).toString();
}
epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
}
String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
if (mappings != null) {
Map<String, String> fullMappingType = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, mappingType);
if ("date".equals(mappingType)) {
TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(sampleValue);
if (timestampMatch != null) {
fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat();
}
}
mappings.put(adjustedFieldName, fullMappingType);
}
return "%{" + grokPatternName + ":" + adjustedFieldName + "}";
}
}
/**
* Unlike {@link ValueOnlyGrokPatternCandidate}, an object of this class is neither immutable nor thread safe.
* When a given object matches a set of strings it chooses a field name. Then that same field name is used when
* processing captures from the pattern. Hence only a single thread may use any particular instance of this
* class.
*/
static class KeyValueGrokPatternCandidate implements GrokPatternCandidate {
private static final Pattern kvFinder = Pattern.compile("\\b(\\w+)=[\\w.-]+");
private final List<String> explanation;
private String fieldName;
KeyValueGrokPatternCandidate(List<String> explanation) {
this.explanation = explanation;
}
@Override
public boolean matchesAll(Collection<String> snippets) {
Set<String> candidateNames = new LinkedHashSet<>();
boolean isFirst = true;
for (String snippet : snippets) {
if (isFirst) {
Matcher matcher = kvFinder.matcher(snippet);
while (matcher.find()) {
candidateNames.add(matcher.group(1));
}
isFirst = false;
} else {
candidateNames.removeIf(candidateName ->
Pattern.compile("\\b" + candidateName + "=[\\w.-]+").matcher(snippet).find() == false);
}
if (candidateNames.isEmpty()) {
break;
}
}
return (fieldName = candidateNames.stream().findFirst().orElse(null)) != null;
}
@Override
public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
Collection<String> epilogues, Map<String, Object> mappings) {
if (fieldName == null) {
throw new IllegalStateException("Cannot process KV matches until a field name has been determined");
}
Grok grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}\\b" +
fieldName + "=%{USER:" + VALUE + "}%{GREEDYDATA:" + EPILOGUE + "}");
Collection<String> values = new ArrayList<>();
for (String snippet : snippets) {
Map<String, Object> captures = grok.captures(snippet);
// If the pattern doesn't match then captures will be null
if (captures == null) {
throw new IllegalStateException("[\\b" + fieldName + "=%{USER}] does not match snippet [" + snippet + "]");
}
prefaces.add(captures.getOrDefault(PREFACE, "").toString());
values.add(captures.getOrDefault(VALUE, "").toString());
epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
}
String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
if (mappings != null) {
mappings.put(adjustedFieldName, LogStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values));
}
return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}";
}
}
/**
* A Grok pattern candidate that matches a single named Grok pattern but will not update mappings.
*/
static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate {
NoMappingGrokPatternCandidate(String grokPatternName, String fieldName) {
super(grokPatternName, null, fieldName);
}
@Override
public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
Collection<String> epilogues, Map<String, Object> mappings) {
return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null);
}
}
/**
* Used to check whether a single Grok pattern matches every sample message in its entirety.
*/
static class FullMatchGrokPatternCandidate {
private final String grokString;
private final String timeField;
private final Grok grok;
FullMatchGrokPatternCandidate(String grokPatternName, String timeField) {
grokString = "%{" + grokPatternName + "}";
this.timeField = timeField;
grok = new Grok(Grok.getBuiltinPatterns(), grokString);
}
public boolean matchesAll(Collection<String> sampleMessages) {
return sampleMessages.stream().allMatch(grok::match);
}
/**
* This must only be called if {@link #matchesAll} returns <code>true</code>.
* @return A tuple of (time field name, Grok string).
*/
public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
Map<String, Object> mappings) {
explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate");
if (mappings != null) {
Map<String, Collection<String>> valuesPerField = new HashMap<>();
for (String sampleMessage : sampleMessages) {
Map<String, Object> captures = grok.captures(sampleMessage);
// If the pattern doesn't match then captures will be null
if (captures == null) {
throw new IllegalStateException("[" + grokString + "] does not match snippet [" + sampleMessage + "]");
}
for (Map.Entry<String, Object> capture : captures.entrySet()) {
String fieldName = capture.getKey();
String fieldValue = capture.getValue().toString();
// Exclude the time field because that will be dropped and replaced with @timestamp
if (fieldName.equals(timeField) == false) {
valuesPerField.compute(fieldName, (k, v) -> {
if (v == null) {
return new ArrayList<>(Collections.singletonList(fieldValue));
} else {
v.add(fieldValue);
return v;
}
});
}
}
}
for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
String fieldName = valuesForField.getKey();
mappings.put(fieldName,
LogStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
}
}
return new Tuple<>(timeField, grokString);
}
}
}
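A sketch of how the two entry points above fit together; the messages are
invented, and the pattern shown in the comment is only indicative of the
general shape of the output:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.GrokPatternCreator;

class GrokPatternCreatorExample {
    public static void main(String[] args) {
        List<String> sampleMessages = Arrays.asList(
            "[2018-08-15T17:00:00,000][INFO ][node-1] starting",
            "[2018-08-15T17:00:01,000][WARN ][node-2] stopping");
        List<String> explanation = new ArrayList<>();
        Map<String, Object> mappings = new HashMap<>();
        GrokPatternCreator creator = new GrokPatternCreator(explanation, sampleMessages, mappings);
        // Try the curated full-line patterns first...
        Tuple<String, String> fullMatch = creator.findFullLineGrokPattern();
        if (fullMatch == null) {
            // ...otherwise grow a pattern outwards from a seed that is already
            // known to match, typically the timestamp found earlier.
            String grokPattern = creator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp");
            // Indicatively, something like:
            // \[%{TIMESTAMP_ISO8601:timestamp}\]\[%{LOGLEVEL:loglevel}.*
            System.out.println(grokPattern);
        }
    }
}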

View File

@@ -0,0 +1,84 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.stream.Collectors;
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
/**
* Really ND-JSON (newline-delimited JSON): one document per line.
*/
public class JsonLogStructureFinder implements LogStructureFinder {
private final List<String> sampleMessages;
private final LogStructure structure;
static JsonLogStructureFinder makeJsonLogStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker) throws IOException {
List<Map<String, ?>> sampleRecords = new ArrayList<>();
List<String> sampleMessages = Arrays.asList(sample.split("\n"));
for (String sampleMessage : sampleMessages) {
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
sampleMessage);
sampleRecords.add(parser.mapOrdered());
}
LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.JSON)
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
.setNumLinesAnalyzed(sampleMessages.size())
.setNumMessagesAnalyzed(sampleRecords.size());
Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
if (timeField != null) {
structureBuilder.setTimestampField(timeField.v1())
.setTimestampFormats(timeField.v2().dateFormats)
.setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
}
SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
LogStructure structure = structureBuilder
.setMappings(mappings)
.setExplanation(explanation)
.build();
return new JsonLogStructureFinder(sampleMessages, structure);
}
private JsonLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}
@Override
public List<String> getSampleMessages() {
return sampleMessages;
}
@Override
public LogStructure getStructure() {
return structure;
}
}

View File

@@ -0,0 +1,87 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Locale;
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
public class JsonLogStructureFinderFactory implements LogStructureFinderFactory {
/**
* This format matches if the sample consists of one or more JSON documents.
* If there is more than one, they must be newline-delimited. The
* documents must be non-empty, to prevent lines containing "{}" from matching.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
int completeDocCount = 0;
try {
String[] sampleLines = sample.split("\n");
for (String sampleLine : sampleLines) {
try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY,
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
if (parser.map().isEmpty()) {
explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
return false;
}
++completeDocCount;
if (parser.nextToken() != null) {
explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
sampleLine + "]");
return false;
}
}
}
} catch (IOException | IllegalStateException e) {
explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
return false;
}
if (completeDocCount == 0) {
explanation.add("Not JSON because sample didn't contain a complete document");
return false;
}
explanation.add("Deciding sample is newline delimited JSON");
return true;
}
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return JsonLogStructureFinder.makeJsonLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
}
private static class ContextPrintingStringReader extends StringReader {
private final String str;
ContextPrintingStringReader(String str) {
super(str);
this.str = str;
}
@Override
public String toString() {
if (str.length() <= 80) {
return String.format(Locale.ROOT, "\"%s\"", str);
} else {
return String.format(Locale.ROOT, "\"%.77s...\"", str);
}
}
}
}
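To illustrate those rules, a hedged sketch with invented samples; the
expected results (true, then false, then false) follow from the checks
documented above rather than from running the code:

import java.util.ArrayList;
import java.util.List;
import org.elasticsearch.xpack.ml.logstructurefinder.JsonLogStructureFinderFactory;

class JsonRulesExample {
    public static void main(String[] args) {
        List<String> explanation = new ArrayList<>();
        JsonLogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
        // Newline-delimited, non-empty JSON objects: accepted.
        System.out.println(factory.canCreateFromSample(explanation,
            "{\"level\":\"INFO\"}\n{\"level\":\"WARN\"}\n"));
        // An empty object is rejected, so files of "{}" lines don't match.
        System.out.println(factory.canCreateFromSample(explanation, "{}\n"));
        // Two objects on one line: not newline-delimited, rejected.
        System.out.println(factory.canCreateFromSample(explanation, "{\"a\":1} {\"b\":2}\n"));
        explanation.forEach(System.out::println);
    }
}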

View File

@@ -0,0 +1,614 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* Stores the log file structure determined by a {@link LogStructureFinder}.
*/
public class LogStructure implements ToXContentObject {
public enum Format {
JSON, XML, CSV, TSV, SEMI_COLON_SEPARATED_VALUES, PIPE_SEPARATED_VALUES, SEMI_STRUCTURED_TEXT;
public Character separator() {
switch (this) {
case JSON:
case XML:
return null;
case CSV:
return ',';
case TSV:
return '\t';
case SEMI_COLON_SEPARATED_VALUES:
return ';';
case PIPE_SEPARATED_VALUES:
return '|';
case SEMI_STRUCTURED_TEXT:
return null;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}
public boolean supportsNesting() {
switch (this) {
case JSON:
case XML:
return true;
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
case SEMI_STRUCTURED_TEXT:
return false;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}
public boolean isStructured() {
switch (this) {
case JSON:
case XML:
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
return true;
case SEMI_STRUCTURED_TEXT:
return false;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}
public boolean isSemiStructured() {
switch (this) {
case JSON:
case XML:
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
return false;
case SEMI_STRUCTURED_TEXT:
return true;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}
public boolean isSeparatedValues() {
switch (this) {
case JSON:
case XML:
return false;
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
return true;
case SEMI_STRUCTURED_TEXT:
return false;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}
public static Format fromSeparator(char separator) {
switch (separator) {
case ',':
return CSV;
case '\t':
return TSV;
case ';':
return SEMI_COLON_SEPARATED_VALUES;
case '|':
return PIPE_SEPARATED_VALUES;
default:
throw new IllegalArgumentException("No known format has separator [" + separator + "]");
}
}
public static Format fromString(String name) {
return valueOf(name.trim().toUpperCase(Locale.ROOT));
}
@Override
public String toString() {
return name().toLowerCase(Locale.ROOT);
}
}
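// Example round trips: fromSeparator(',') == CSV and CSV.separator() == ',';
// fromString("csv") == CSV and CSV.toString() equals "csv".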
static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
static final ParseField SAMPLE_START = new ParseField("sample_start");
static final ParseField CHARSET = new ParseField("charset");
static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
static final ParseField STRUCTURE = new ParseField("format");
static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
static final ParseField INPUT_FIELDS = new ParseField("input_fields");
static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
static final ParseField SEPARATOR = new ParseField("separator");
static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
static final ParseField MAPPINGS = new ParseField("mappings");
static final ParseField EXPLANATION = new ParseField("explanation");
public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("log_file_structure", false, Builder::new);
static {
PARSER.declareInt(Builder::setNumLinesAnalyzed, NUM_LINES_ANALYZED);
PARSER.declareInt(Builder::setNumMessagesAnalyzed, NUM_MESSAGES_ANALYZED);
PARSER.declareString(Builder::setSampleStart, SAMPLE_START);
PARSER.declareString(Builder::setCharset, CHARSET);
PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER);
PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE);
PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
PARSER.declareStringArray(Builder::setInputFields, INPUT_FIELDS);
PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
PARSER.declareString((p, c) -> p.setSeparator(c.charAt(0)), SEPARATOR);
PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS);
PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE);
PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS);
PARSER.declareStringArray(Builder::setExplanation, EXPLANATION);
}
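// Typical use of the parser, e.g.: LogStructure structure = PARSER.parse(parser, null).build();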
private final int numLinesAnalyzed;
private final int numMessagesAnalyzed;
private final String sampleStart;
private final String charset;
private final Boolean hasByteOrderMarker;
private final Format format;
private final String multilineStartPattern;
private final String excludeLinesPattern;
private final List<String> inputFields;
private final Boolean hasHeaderRow;
private final Character separator;
private final Boolean shouldTrimFields;
private final String grokPattern;
private final List<String> timestampFormats;
private final String timestampField;
private final boolean needClientTimezone;
private final SortedMap<String, Object> mappings;
private final List<String> explanation;
public LogStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
Format format, String multilineStartPattern, String excludeLinesPattern, List<String> inputFields,
Boolean hasHeaderRow, Character separator, Boolean shouldTrimFields, String grokPattern, String timestampField,
List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
List<String> explanation) {
this.numLinesAnalyzed = numLinesAnalyzed;
this.numMessagesAnalyzed = numMessagesAnalyzed;
this.sampleStart = Objects.requireNonNull(sampleStart);
this.charset = Objects.requireNonNull(charset);
this.hasByteOrderMarker = hasByteOrderMarker;
this.format = Objects.requireNonNull(format);
this.multilineStartPattern = multilineStartPattern;
this.excludeLinesPattern = excludeLinesPattern;
this.inputFields = (inputFields == null) ? null : Collections.unmodifiableList(new ArrayList<>(inputFields));
this.hasHeaderRow = hasHeaderRow;
this.separator = separator;
this.shouldTrimFields = shouldTrimFields;
this.grokPattern = grokPattern;
this.timestampField = timestampField;
this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats));
this.needClientTimezone = needClientTimezone;
this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings));
this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
}
public int getNumLinesAnalyzed() {
return numLinesAnalyzed;
}
public int getNumMessagesAnalyzed() {
return numMessagesAnalyzed;
}
public String getSampleStart() {
return sampleStart;
}
public String getCharset() {
return charset;
}
public Boolean getHasByteOrderMarker() {
return hasByteOrderMarker;
}
public Format getFormat() {
return format;
}
public String getMultilineStartPattern() {
return multilineStartPattern;
}
public String getExcludeLinesPattern() {
return excludeLinesPattern;
}
public List<String> getInputFields() {
return inputFields;
}
public Boolean getHasHeaderRow() {
return hasHeaderRow;
}
public Character getSeparator() {
return separator;
}
public Boolean getShouldTrimFields() {
return shouldTrimFields;
}
public String getGrokPattern() {
return grokPattern;
}
public String getTimestampField() {
return timestampField;
}
public List<String> getTimestampFormats() {
return timestampFormats;
}
public boolean needClientTimezone() {
return needClientTimezone;
}
public SortedMap<String, Object> getMappings() {
return mappings;
}
public List<String> getExplanation() {
return explanation;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field(NUM_LINES_ANALYZED.getPreferredName(), numLinesAnalyzed);
builder.field(NUM_MESSAGES_ANALYZED.getPreferredName(), numMessagesAnalyzed);
builder.field(SAMPLE_START.getPreferredName(), sampleStart);
builder.field(CHARSET.getPreferredName(), charset);
if (hasByteOrderMarker != null) {
builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue());
}
builder.field(STRUCTURE.getPreferredName(), format);
if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) {
builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern);
}
if (excludeLinesPattern != null && excludeLinesPattern.isEmpty() == false) {
builder.field(EXCLUDE_LINES_PATTERN.getPreferredName(), excludeLinesPattern);
}
if (inputFields != null && inputFields.isEmpty() == false) {
builder.field(INPUT_FIELDS.getPreferredName(), inputFields);
}
if (hasHeaderRow != null) {
builder.field(HAS_HEADER_ROW.getPreferredName(), hasHeaderRow.booleanValue());
}
if (separator != null) {
builder.field(SEPARATOR.getPreferredName(), String.valueOf(separator));
}
if (shouldTrimFields != null) {
builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue());
}
if (grokPattern != null && grokPattern.isEmpty() == false) {
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
}
if (timestampField != null && timestampField.isEmpty() == false) {
builder.field(TIMESTAMP_FIELD.getPreferredName(), timestampField);
}
if (timestampFormats != null && timestampFormats.isEmpty() == false) {
builder.field(TIMESTAMP_FORMATS.getPreferredName(), timestampFormats);
}
builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone);
builder.field(MAPPINGS.getPreferredName(), mappings);
builder.field(EXPLANATION.getPreferredName(), explanation);
builder.endObject();
return builder;
}
@Override
public int hashCode() {
return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern, timestampField,
timestampFormats, needClientTimezone, mappings, explanation);
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other == null || getClass() != other.getClass()) {
return false;
}
LogStructure that = (LogStructure) other;
return this.numLinesAnalyzed == that.numLinesAnalyzed &&
this.numMessagesAnalyzed == that.numMessagesAnalyzed &&
this.needClientTimezone == that.needClientTimezone &&
Objects.equals(this.sampleStart, that.sampleStart) &&
Objects.equals(this.charset, that.charset) &&
Objects.equals(this.hasByteOrderMarker, that.hasByteOrderMarker) &&
Objects.equals(this.format, that.format) &&
Objects.equals(this.multilineStartPattern, that.multilineStartPattern) &&
Objects.equals(this.excludeLinesPattern, that.excludeLinesPattern) &&
Objects.equals(this.inputFields, that.inputFields) &&
Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
Objects.equals(this.separator, that.separator) &&
Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
Objects.equals(this.grokPattern, that.grokPattern) &&
Objects.equals(this.timestampField, that.timestampField) &&
Objects.equals(this.timestampFormats, that.timestampFormats) &&
Objects.equals(this.mappings, that.mappings) &&
Objects.equals(this.explanation, that.explanation);
}
public static class Builder {
private int numLinesAnalyzed;
private int numMessagesAnalyzed;
private String sampleStart;
private String charset;
private Boolean hasByteOrderMarker;
private Format format;
private String multilineStartPattern;
private String excludeLinesPattern;
private List<String> inputFields;
private Boolean hasHeaderRow;
private Character separator;
private Boolean shouldTrimFields;
private String grokPattern;
private String timestampField;
private List<String> timestampFormats;
private boolean needClientTimezone;
private Map<String, Object> mappings;
private List<String> explanation;
public Builder() {
this(Format.SEMI_STRUCTURED_TEXT);
}
public Builder(Format format) {
setFormat(format);
}
public Builder setNumLinesAnalyzed(int numLinesAnalyzed) {
this.numLinesAnalyzed = numLinesAnalyzed;
return this;
}
public Builder setNumMessagesAnalyzed(int numMessagesAnalyzed) {
this.numMessagesAnalyzed = numMessagesAnalyzed;
return this;
}
public Builder setSampleStart(String sampleStart) {
this.sampleStart = Objects.requireNonNull(sampleStart);
return this;
}
public Builder setCharset(String charset) {
this.charset = Objects.requireNonNull(charset);
return this;
}
public Builder setHasByteOrderMarker(Boolean hasByteOrderMarker) {
this.hasByteOrderMarker = hasByteOrderMarker;
return this;
}
public Builder setFormat(Format format) {
this.format = Objects.requireNonNull(format);
this.separator = format.separator();
return this;
}
public Builder setMultilineStartPattern(String multilineStartPattern) {
this.multilineStartPattern = multilineStartPattern;
return this;
}
public Builder setExcludeLinesPattern(String excludeLinesPattern) {
this.excludeLinesPattern = excludeLinesPattern;
return this;
}
public Builder setInputFields(List<String> inputFields) {
this.inputFields = inputFields;
return this;
}
public Builder setHasHeaderRow(Boolean hasHeaderRow) {
this.hasHeaderRow = hasHeaderRow;
return this;
}
public Builder setShouldTrimFields(Boolean shouldTrimFields) {
this.shouldTrimFields = shouldTrimFields;
return this;
}
public Builder setSeparator(Character separator) {
this.separator = separator;
return this;
}
public Builder setGrokPattern(String grokPattern) {
this.grokPattern = grokPattern;
return this;
}
public Builder setTimestampField(String timestampField) {
this.timestampField = timestampField;
return this;
}
public Builder setTimestampFormats(List<String> timestampFormats) {
this.timestampFormats = timestampFormats;
return this;
}
public Builder setNeedClientTimezone(boolean needClientTimezone) {
this.needClientTimezone = needClientTimezone;
return this;
}
public Builder setMappings(Map<String, Object> mappings) {
this.mappings = Objects.requireNonNull(mappings);
return this;
}
public Builder setExplanation(List<String> explanation) {
this.explanation = Objects.requireNonNull(explanation);
return this;
}
@SuppressWarnings("fallthrough")
public LogStructure build() {
if (numLinesAnalyzed <= 0) {
throw new IllegalArgumentException("Number of lines analyzed must be positive.");
}
if (numMessagesAnalyzed <= 0) {
throw new IllegalArgumentException("Number of messages analyzed must be positive.");
}
if (numMessagesAnalyzed > numLinesAnalyzed) {
throw new IllegalArgumentException("Number of messages analyzed cannot be greater than number of lines analyzed.");
}
if (sampleStart == null || sampleStart.isEmpty()) {
throw new IllegalArgumentException("Sample start must be specified.");
}
if (charset == null || charset.isEmpty()) {
throw new IllegalArgumentException("A character set must be specified.");
}
if (charset.toUpperCase(Locale.ROOT).startsWith("UTF") == false && hasByteOrderMarker != null) {
throw new IllegalArgumentException("A byte order marker is only possible for UTF character sets.");
}
switch (format) {
case JSON:
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}
// $FALL-THROUGH$
case XML:
if (hasHeaderRow != null) {
throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
}
if (separator != null) {
throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
}
if (grokPattern != null) {
throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
}
break;
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
if (inputFields == null || inputFields.isEmpty()) {
throw new IllegalArgumentException("Input fields must be specified for [" + format + "] structures.");
}
if (hasHeaderRow == null) {
throw new IllegalArgumentException("Has header row must be specified for [" + format + "] structures.");
}
Character expectedSeparator = format.separator();
assert expectedSeparator != null;
if (expectedSeparator.equals(separator) == false) {
throw new IllegalArgumentException("Separator must be [" + expectedSeparator + "] for [" + format +
"] structures.");
}
if (grokPattern != null) {
throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
}
break;
case SEMI_STRUCTURED_TEXT:
if (inputFields != null) {
throw new IllegalArgumentException("Input fields may not be specified for [" + format + "] structures.");
}
if (hasHeaderRow != null) {
throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
}
if (separator != null) {
throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
}
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}
if (grokPattern == null || grokPattern.isEmpty()) {
throw new IllegalArgumentException("Grok pattern must be specified for [" + format + "] structures.");
}
break;
default:
throw new IllegalStateException("enum value [" + format + "] missing from switch.");
}
if ((timestampField == null) != (timestampFormats == null || timestampFormats.isEmpty())) {
throw new IllegalArgumentException("Timestamp field and timestamp formats must both be specified or neither be specified.");
}
if (needClientTimezone && timestampField == null) {
throw new IllegalArgumentException("Client timezone cannot be needed if there is no timestamp field.");
}
if (mappings == null || mappings.isEmpty()) {
throw new IllegalArgumentException("Mappings must be specified.");
}
if (explanation == null || explanation.isEmpty()) {
throw new IllegalArgumentException("Explanation must be specified.");
}
return new LogStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern,
timestampField, timestampFormats, needClientTimezone, mappings, explanation);
}
}
}
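
As a rough illustration of the Builder contract above, here is a hedged sketch (not part of the commit) of assembling a CSV structure; every concrete value is invented, and the class is assumed to sit in the same org.elasticsearch.xpack.ml.logstructurefinder package so the types resolve.

package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.Arrays;
import java.util.Collections;
import java.util.Map;

public class LogStructureBuilderSketch {
    public static void main(String[] args) {
        // Hypothetical values throughout - they exist only to satisfy the
        // build() validation rules for the CSV format.
        Map<String, Object> mappings =
            Collections.singletonMap("message", Collections.singletonMap("type", "text"));
        LogStructure structure = new LogStructure.Builder(LogStructure.Format.CSV)
            .setNumLinesAnalyzed(100)
            .setNumMessagesAnalyzed(99) // the header row is not a message
            .setSampleStart("time,id,message\n")
            .setCharset("UTF-8")
            .setHasHeaderRow(true)
            .setInputFields(Arrays.asList("time", "id", "message"))
            .setTimestampField("time")
            .setTimestampFormats(Collections.singletonList("ISO8601"))
            .setMappings(mappings)
            .setExplanation(Collections.singletonList("hypothetical example"))
            .build();
        // The separator is implied by the format and set by the constructor
        System.out.println(structure.getFormat() + " separated by [" + structure.getSeparator() + "]");
    }
}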


@ -0,0 +1,23 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import java.util.List;
public interface LogStructureFinder {
/**
* The (possibly multi-line) messages that the log sample was split into.
* @return A list of messages.
*/
List<String> getSampleMessages();
/**
* Retrieve the structure of the log file used to instantiate the finder.
* @return The log file structure.
*/
LogStructure getStructure();
}


@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import java.util.List;
public interface LogStructureFinderFactory {
/**
* Given a sample of a log file, decide whether this factory will be able
* to create an appropriate object to represent its ingestion configs.
* @param explanation List of reasons for making decisions. The list may already contain items
* when passed in, and this method may append new reasons to it.
* @param sample A sample from the log file to be ingested.
* @return <code>true</code> if this factory can create an appropriate log
* file structure given the sample; otherwise <code>false</code>.
*/
boolean canCreateFromSample(List<String> explanation, String sample);
/**
* Create an object representing the structure of a log file.
* @param explanation List of reasons for making decisions. The list may already contain items
* when passed in, and this method may append new reasons to it.
* @param sample A sample from the log file to be ingested.
* @param charsetName The name of the character set in which the sample was provided.
* @param hasByteOrderMarker Did the sample have a byte order marker? <code>null</code> means "not relevant".
* @return A log file structure object suitable for ingesting the supplied sample.
* @throws Exception if something goes wrong during creation.
*/
LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws Exception;
}
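
To make the contract concrete, here is a deliberately trivial, hypothetical factory sketch (not part of the commit): it accepts every sample and hands back a pre-built finder, which is roughly the shape the real per-format factories fill in with detection logic.

package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.List;

public class FixedLogStructureFinderFactory implements LogStructureFinderFactory {

    private final LogStructureFinder finder;

    public FixedLogStructureFinderFactory(LogStructureFinder finder) {
        this.finder = finder;
    }

    @Override
    public boolean canCreateFromSample(List<String> explanation, String sample) {
        explanation.add("Fixed factory accepts any sample");
        return true;
    }

    @Override
    public LogStructureFinder createFromSample(List<String> explanation, String sample,
                                               String charsetName, Boolean hasByteOrderMarker) {
        return finder; // a real factory would parse the sample here
    }
}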


@ -0,0 +1,232 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.elasticsearch.common.collect.Tuple;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
/**
* Runs the high-level steps needed to create ingest configs for the specified log file. In order:
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
* 2. Load a sample of the file, consisting of its first 1000 lines
* 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
* 4. Create an appropriate structure object and delegate writing configs to it
*/
public final class LogStructureFinderManager {
public static final int MIN_SAMPLE_LINE_COUNT = 2;
static final Set<String> FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252",
"cp1253", "cp1254", "cp1255", "cp1256", "cp1257", "cp1258", "cp819", "cp866", "csbig5", "cseuckr", "cseucpkdfmtjapanese",
"csgb2312", "csibm866", "csiso2022jp", "csiso2022kr", "csiso58gb231280", "csiso88596e", "csiso88596i", "csiso88598e", "csiso88598i",
"csisolatin1", "csisolatin2", "csisolatin3", "csisolatin4", "csisolatin5", "csisolatin6", "csisolatin9", "csisolatinarabic",
"csisolatincyrillic", "csisolatingreek", "csisolatinhebrew", "cskoi8r", "csksc56011987", "csmacintosh", "csshiftjis", "cyrillic",
"dos-874", "ecma-114", "ecma-118", "elot_928", "euc-jp", "euc-kr", "gb18030", "gb2312", "gb_2312", "gb_2312-80", "gbk", "greek",
"greek8", "hebrew", "hz-gb-2312", "ibm819", "ibm866", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-jp", "iso-2022-kr", "iso-8859-1",
"iso-8859-10", "iso-8859-11", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "iso-8859-2", "iso-8859-3", "iso-8859-4",
"iso-8859-5", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-8859-7", "iso-8859-8", "iso-8859-8-e", "iso-8859-8-i",
"iso-8859-9", "iso-ir-100", "iso-ir-101", "iso-ir-109", "iso-ir-110", "iso-ir-126", "iso-ir-127", "iso-ir-138", "iso-ir-144",
"iso-ir-148", "iso-ir-149", "iso-ir-157", "iso-ir-58", "iso8859-1", "iso8859-10", "iso8859-11", "iso8859-13", "iso8859-14",
"iso8859-15", "iso8859-2", "iso8859-3", "iso8859-4", "iso8859-5", "iso8859-6", "iso8859-6e", "iso8859-6i", "iso8859-7", "iso8859-8",
"iso8859-8e", "iso8859-8i", "iso8859-9", "iso88591", "iso885910", "iso885911", "iso885913", "iso885914", "iso885915", "iso88592",
"iso88593", "iso88594", "iso88595", "iso88596", "iso88597", "iso88598", "iso88599", "iso_8859-1", "iso_8859-15", "iso_8859-1:1987",
"iso_8859-2", "iso_8859-2:1987", "iso_8859-3", "iso_8859-3:1988", "iso_8859-4", "iso_8859-4:1988", "iso_8859-5", "iso_8859-5:1988",
"iso_8859-6", "iso_8859-6:1987", "iso_8859-7", "iso_8859-7:1987", "iso_8859-8", "iso_8859-8:1988", "iso_8859-9", "iso_8859-9:1989",
"koi", "koi8", "koi8-r", "koi8-ru", "koi8-u", "koi8_r", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "l1",
"l2", "l3", "l4", "l5", "l6", "l9", "latin1", "latin2", "latin3", "latin4", "latin5", "latin6", "logical", "mac", "macintosh",
"ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "sun_eu_greek", "tis-620", "unicode-1-1-utf-8", "us-ascii", "utf-16",
"utf-16-bom", "utf-16be", "utf-16be-bom", "utf-16le", "utf-16le-bom", "utf-8", "utf8", "visual", "windows-1250", "windows-1251",
"windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "windows-31j",
"windows-874", "windows-949", "x-cp1250", "x-cp1251", "x-cp1252", "x-cp1253", "x-cp1254", "x-cp1255", "x-cp1256", "x-cp1257",
"x-cp1258", "x-euc-jp", "x-gbk", "x-mac-cyrillic", "x-mac-roman", "x-mac-ukrainian", "x-sjis", "x-x-big5"
)));
/**
* These need to be ordered so that the more generic formats come after the more specific ones
*/
private static final List<LogStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
new JsonLogStructureFinderFactory(),
new XmlLogStructureFinderFactory(),
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
new CsvLogStructureFinderFactory(),
new TsvLogStructureFinderFactory(),
new SemiColonSeparatedValuesLogStructureFinderFactory(),
new PipeSeparatedValuesLogStructureFinderFactory(),
new TextLogStructureFinderFactory()
));
private static final int BUFFER_SIZE = 8192;
/**
* Given a stream of data from some log file, determine its structure.
* @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
* If the stream has fewer lines than this, an attempt will still be made, provided at
* least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.
* @param fromFile A stream from which the sample will be read.
* @return A {@link LogStructureFinder} object from which the structure and messages can be queried.
* @throws Exception A variety of problems could occur at various stages of the structure finding process.
*/
public LogStructureFinder findLogStructure(int idealSampleLineCount, InputStream fromFile) throws Exception {
return findLogStructure(new ArrayList<>(), idealSampleLineCount, fromFile);
}
public LogStructureFinder findLogStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
throws Exception {
CharsetMatch charsetMatch = findCharset(explanation, fromFile);
String charsetName = charsetMatch.getName();
Tuple<String, Boolean> sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT,
Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));
return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2());
}
CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {
// We need an input stream that supports mark and reset, so wrap the argument
// in a BufferedInputStream if it doesn't already support this feature
if (inputStream.markSupported() == false) {
inputStream = new BufferedInputStream(inputStream, BUFFER_SIZE);
}
// This is from ICU4J
CharsetDetector charsetDetector = new CharsetDetector().setText(inputStream);
CharsetMatch[] charsetMatches = charsetDetector.detectAll();
// Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
boolean pureAscii = true;
boolean containsZeroBytes = false;
inputStream.mark(BUFFER_SIZE);
byte[] workspace = new byte[BUFFER_SIZE];
int remainingLength = BUFFER_SIZE;
do {
int bytesRead = inputStream.read(workspace, 0, remainingLength);
if (bytesRead <= 0) {
break;
}
for (int i = 0; i < bytesRead && containsZeroBytes == false; ++i) {
if (workspace[i] == 0) {
containsZeroBytes = true;
pureAscii = false;
} else {
pureAscii = pureAscii && workspace[i] > 0 && workspace[i] < 128;
}
}
remainingLength -= bytesRead;
} while (containsZeroBytes == false && remainingLength > 0);
inputStream.reset();
if (pureAscii) {
// If the input is pure ASCII then many single byte character sets will match. We want to favour
// UTF-8 in this case, as it avoids putting a bold declaration of a dubious character set choice
// in the config files.
Optional<CharsetMatch> utf8CharsetMatch = Arrays.stream(charsetMatches)
.filter(charsetMatch -> StandardCharsets.UTF_8.name().equals(charsetMatch.getName())).findFirst();
if (utf8CharsetMatch.isPresent()) {
explanation.add("Using character encoding [" + StandardCharsets.UTF_8.name() +
"], which matched the input with [" + utf8CharsetMatch.get().getConfidence() + "%] confidence - first [" +
(BUFFER_SIZE / 1024) + "kB] of input was pure ASCII");
return utf8CharsetMatch.get();
}
}
// Input wasn't pure ASCII, so use the best matching character set that's supported by both Java and Go.
// Additionally, if the input contains zero bytes then avoid single byte character sets, as ICU4J will
// suggest these for binary files even though the contents are not really text
for (CharsetMatch charsetMatch : charsetMatches) {
String name = charsetMatch.getName();
if (Charset.isSupported(name) && FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) {
// This extra test is to avoid trying to read binary files as text. Running the log config
// deduction algorithms on binary files is very slow as the binary files generally appear to
// have very long lines.
boolean spaceEncodingContainsZeroByte = false;
byte[] spaceBytes = " ".getBytes(name);
for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) {
spaceEncodingContainsZeroByte = (spaceBytes[i] == 0);
}
if (containsZeroBytes && spaceEncodingContainsZeroByte == false) {
explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
"%] confidence but was rejected as the input contains zero bytes and the [" + name + "] encoding does not");
} else {
explanation.add("Using character encoding [" + name + "], which matched the input with [" +
charsetMatch.getConfidence() + "%] confidence");
return charsetMatch;
}
} else {
explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
"%] confidence but was rejected as it is not supported by [" +
(Charset.isSupported(name) ? "Filebeat" : "the JVM") + "]");
}
}
throw new IllegalArgumentException("Could not determine a usable character encoding for the input" +
(containsZeroBytes ? " - could it be binary data?" : ""));
}
LogStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws Exception {
for (LogStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) {
if (factory.canCreateFromSample(explanation, sample)) {
return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker);
}
}
throw new IllegalArgumentException("Input did not match any known formats");
}
private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {
int lineCount = 0;
BufferedReader bufferedReader = new BufferedReader(reader);
StringBuilder sample = new StringBuilder();
// Don't include any byte-order-marker in the sample. (The logic to skip it works for both
// UTF-8 and UTF-16 assuming the character set of the reader was correctly detected.)
Boolean hasByteOrderMarker = null;
if (charsetName.toUpperCase(Locale.ROOT).startsWith("UTF")) {
int maybeByteOrderMarker = reader.read();
hasByteOrderMarker = ((char) maybeByteOrderMarker == '\uFEFF');
if (maybeByteOrderMarker >= 0 && hasByteOrderMarker == false && (char) maybeByteOrderMarker != '\r') {
sample.appendCodePoint(maybeByteOrderMarker);
if ((char) maybeByteOrderMarker == '\n') {
++lineCount;
}
}
}
String line;
while ((line = bufferedReader.readLine()) != null && ++lineCount <= maxLines) {
sample.append(line).append('\n');
}
if (lineCount < minLines) {
throw new IllegalArgumentException("Input contained too few lines to sample");
}
return new Tuple<>(sample.toString(), hasByteOrderMarker);
}
}
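
Putting the manager together end to end, a hedged usage sketch follows (the file path and line count are invented; only methods shown above are used).

package org.elasticsearch.xpack.ml.logstructurefinder;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class FindLogStructureSketch {
    public static void main(String[] args) throws Exception {
        List<String> explanation = new ArrayList<>();
        // Hypothetical input file
        try (InputStream in = Files.newInputStream(Paths.get("/var/log/example.log"))) {
            LogStructureFinder finder =
                new LogStructureFinderManager().findLogStructure(explanation, 1000, in);
            System.out.println("Deduced format: " + finder.getStructure().getFormat());
            System.out.println("Messages analyzed: " + finder.getSampleMessages().size());
            // The explanation records every decision made along the way
            explanation.forEach(reason -> System.out.println(" - " + reason));
        }
    }
}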


@ -0,0 +1,238 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.grok.Grok;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
final class LogStructureUtils {
static final String DEFAULT_TIMESTAMP_FIELD = "@timestamp";
static final String MAPPING_TYPE_SETTING = "type";
static final String MAPPING_FORMAT_SETTING = "format";
static final String MAPPING_PROPERTIES_SETTING = "properties";
// NUMBER Grok pattern doesn't support scientific notation, so we extend it
private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$");
private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$");
private static final int KEYWORD_MAX_LEN = 256;
private static final int KEYWORD_MAX_SPACES = 5;
private LogStructureUtils() {
}
/**
* Given one or more sample records, find a timestamp field that is consistently present in them all.
* To be returned the timestamp field:
* - Must exist in every record
* - Must have the same timestamp format in every record
* If multiple fields meet these criteria then the one that occurred first in the first sample record
* is chosen.
* @param explanation List of reasons for choosing the overall log structure. This list
* may be non-empty when the method is called, and this method may
* append to it.
* @param sampleRecords List of records derived from the provided log sample.
* @return A tuple of (field name, timestamp format) if one can be found, or <code>null</code> if
* there is no consistent timestamp.
*/
static Tuple<String, TimestampMatch> guessTimestampField(List<String> explanation, List<Map<String, ?>> sampleRecords) {
if (sampleRecords.isEmpty()) {
return null;
}
// Accept the first match from the first sample that is compatible with all the other samples
for (Tuple<String, TimestampMatch> candidate : findCandidates(explanation, sampleRecords)) {
boolean allGood = true;
for (Map<String, ?> sampleRecord : sampleRecords.subList(1, sampleRecords.size())) {
Object fieldValue = sampleRecord.get(candidate.v1());
if (fieldValue == null) {
explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
"] doesn't have field");
allGood = false;
break;
}
TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString());
if (match == null || match.candidateIndex != candidate.v2().candidateIndex) {
explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
"] matches differently: [" + match + "]");
allGood = false;
break;
}
}
if (allGood) {
explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]");
return candidate;
}
}
return null;
}
private static List<Tuple<String, TimestampMatch>> findCandidates(List<String> explanation, List<Map<String, ?>> sampleRecords) {
List<Tuple<String, TimestampMatch>> candidates = new ArrayList<>();
// Get candidate timestamps from the first sample record
for (Map.Entry<String, ?> entry : sampleRecords.get(0).entrySet()) {
Object value = entry.getValue();
if (value != null) {
TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString());
if (match != null) {
Tuple<String, TimestampMatch> candidate = new Tuple<>(entry.getKey(), match);
candidates.add(candidate);
explanation.add("First sample timestamp match [" + candidate + "]");
}
}
}
return candidates;
}
/**
* Given the sampled records, guess appropriate Elasticsearch mappings.
* @param sampleRecords The sampled records.
* @return A map of field name to mapping settings.
*/
static SortedMap<String, Object> guessMappings(List<String> explanation, List<Map<String, ?>> sampleRecords) {
SortedMap<String, Object> mappings = new TreeMap<>();
for (Map<String, ?> sampleRecord : sampleRecords) {
for (String fieldName : sampleRecord.keySet()) {
mappings.computeIfAbsent(fieldName, key -> guessMapping(explanation, fieldName,
sampleRecords.stream().flatMap(record -> {
Object fieldValue = record.get(fieldName);
return (fieldValue == null) ? Stream.empty() : Stream.of(fieldValue);
}
).collect(Collectors.toList())));
}
}
return mappings;
}
static Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
if (fieldValues == null || fieldValues.isEmpty()) {
// We can get here if all the records that contained a given field had a null value for it.
// In this case it's best not to make any statement about what the mapping type should be.
return null;
}
if (fieldValues.stream().anyMatch(value -> value instanceof Map)) {
if (fieldValues.stream().allMatch(value -> value instanceof Map)) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
}
throw new IllegalArgumentException("Field [" + fieldName +
"] has both object and non-object values - this is not supported by Elasticsearch");
}
if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) {
// Elasticsearch fields can be either arrays or single values, but array values must all have the same type
return guessMapping(explanation, fieldName,
fieldValues.stream().flatMap(LogStructureUtils::flatten).collect(Collectors.toList()));
}
return guessScalarMapping(explanation, fieldName, fieldValues.stream().map(Object::toString).collect(Collectors.toList()));
}
private static Stream<Object> flatten(Object value) {
if (value instanceof List) {
@SuppressWarnings("unchecked")
List<Object> objectList = (List<Object>) value;
return objectList.stream();
} else if (value instanceof Object[]) {
return Arrays.stream((Object[]) value);
} else {
return Stream.of(value);
}
}
/**
* Given some sample values for a field, guess the most appropriate index mapping for the
* field.
* @param explanation List of reasons for choosing the overall log structure. This list
* may be non-empty when the method is called, and this method may
* append to it.
* @param fieldName Name of the field for which mappings are to be guessed.
* @param fieldValues Values of the field for which mappings are to be guessed. The guessed
* mapping will be compatible with all the provided values. Must not be
* empty.
* @return The sub-section of the index mappings most appropriate for the field,
* for example <code>{ "type" : "keyword" }</code>.
*/
static Map<String, String> guessScalarMapping(List<String> explanation, String fieldName, Collection<String> fieldValues) {
assert fieldValues.isEmpty() == false;
if (fieldValues.stream().allMatch(value -> "true".equals(value) || "false".equals(value))) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean");
}
// This checks if a date mapping would be appropriate, and, if so, finds the correct format
Iterator<String> iter = fieldValues.iterator();
TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next());
while (timestampMatch != null && iter.hasNext()) {
// To be mapped as type date all the values must match the same date format - it is
// not acceptable for all values to be dates, but with different formats
if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex)) == false) {
timestampMatch = null;
}
}
if (timestampMatch != null) {
return timestampMatch.getEsDateMappingTypeWithFormat();
}
if (fieldValues.stream().allMatch(NUMBER_GROK::match)) {
try {
fieldValues.forEach(Long::parseLong);
return Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
} catch (NumberFormatException e) {
explanation.add("Rejecting type 'long' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
}
try {
fieldValues.forEach(Double::parseDouble);
return Collections.singletonMap(MAPPING_TYPE_SETTING, "double");
} catch (NumberFormatException e) {
explanation.add("Rejecting type 'double' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
}
} else if (fieldValues.stream().allMatch(IP_GROK::match)) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
}
if (fieldValues.stream().anyMatch(LogStructureUtils::isMoreLikelyTextThanKeyword)) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "text");
}
return Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
}
/**
* The thinking is that the longer the field value and the more spaces it contains,
* the more likely it is that it should be indexed as text rather than keyword.
*/
static boolean isMoreLikelyTextThanKeyword(String str) {
int length = str.length();
return length > KEYWORD_MAX_LEN || length - str.replaceAll("\\s", "").length() > KEYWORD_MAX_SPACES;
}
}
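
The scalar type guessing above tries boolean, then date, then long, then double, then ip, before falling back to text or keyword. A hedged sketch of the expected behaviour follows (assuming none of these strings happens to match a timestamp format, and run from the same package since the helper is package-private).

package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class GuessScalarMappingSketch {
    public static void main(String[] args) {
        List<String> explanation = new ArrayList<>();
        Map<String, String> mapping;
        mapping = LogStructureUtils.guessScalarMapping(explanation, "f", Arrays.asList("true", "false"));
        System.out.println(mapping); // expected: {type=boolean}
        mapping = LogStructureUtils.guessScalarMapping(explanation, "f", Arrays.asList("42", "0"));
        System.out.println(mapping); // expected: {type=long}
        mapping = LogStructureUtils.guessScalarMapping(explanation, "f", Arrays.asList("3.14", "2"));
        System.out.println(mapping); // expected: {type=double} - "3.14" fails Long.parseLong
        mapping = LogStructureUtils.guessScalarMapping(explanation, "f", Arrays.asList("10.2.1.4"));
        System.out.println(mapping); // expected: {type=ip} via the anchored %{IP} Grok pattern
    }
}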


@ -0,0 +1,38 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.supercsv.prefs.CsvPreference;
import java.io.IOException;
import java.util.List;
public class PipeSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {
private static final CsvPreference PIPE_PREFERENCE = new CsvPreference.Builder('"', '|', "\n").build();
/**
* Rules are:
* - The file must be valid pipe (<code>|</code>) separated values
* - It must contain at least two complete records
* - There must be at least five fields per record (otherwise files with coincidental
* or no pipe characters could be treated as pipe separated)
* - Every pipe separated value record except the last must have the same number of fields
* The reason the last record is allowed to have fewer fields than the others is that
* it could have been truncated when the file was sampled.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 5, PIPE_PREFERENCE, "pipe separated values");
}
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
PIPE_PREFERENCE, true);
}
}


@ -0,0 +1,37 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.supercsv.prefs.CsvPreference;
import java.io.IOException;
import java.util.List;
public class SemiColonSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {
/**
* Rules are:
* - The file must be valid semi-colon separated values
* - It must contain at least two complete records
* - There must be at least four fields per record (otherwise files with coincidental
* or no semi-colons could be treated as semi-colon separated)
* - Every semi-colon separated value record except the last must have the same number of fields
* The reason the last record is allowed to have fewer fields than the others is that
* it could have been truncated when the file was sampled.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 4,
CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, "semi-colon separated values");
}
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, false);
}
}
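
For reference, the semicolon handling here comes entirely from Super CSV's built-in constant: as far as I recall, EXCEL_NORTH_EUROPE_PREFERENCE is the semicolon-delimited analogue of the PIPE_PREFERENCE built above, roughly as sketched below (an assumption, not taken from this commit).

package org.elasticsearch.xpack.ml.logstructurefinder;

import org.supercsv.prefs.CsvPreference;

public class SemiColonPreferenceSketch {
    // Assumed equivalent of CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE
    static final CsvPreference SEMI_COLON_PREFERENCE = new CsvPreference.Builder('"', ';', "\n").build();

    public static void main(String[] args) {
        System.out.println((char) SEMI_COLON_PREFERENCE.getDelimiterChar()); // prints ';'
    }
}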


@ -0,0 +1,486 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import org.supercsv.exception.SuperCsvException;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;
import org.supercsv.util.Util;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.DoubleSummaryStatistics;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.SortedMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public class SeparatedValuesLogStructureFinder implements LogStructureFinder {
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
private final List<String> sampleMessages;
private final LogStructure structure;
static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(List<String> explanation, String sample,
String charsetName, Boolean hasByteOrderMarker,
CsvPreference csvPreference, boolean trimFields)
throws IOException {
Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
List<List<String>> rows = parsed.v1();
List<Integer> lineNumbers = parsed.v2();
Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows);
boolean isHeaderInFile = headerInfo.v1();
String[] header = headerInfo.v2();
String[] headerWithNamedBlanks = new String[header.length];
for (int i = 0; i < header.length; ++i) {
String rawHeader = header[i].isEmpty() ? "column" + (i + 1) : header[i];
headerWithNamedBlanks[i] = trimFields ? rawHeader.trim() : rawHeader;
}
List<String> sampleLines = Arrays.asList(sample.split("\n"));
List<String> sampleMessages = new ArrayList<>();
List<Map<String, ?>> sampleRecords = new ArrayList<>();
int prevMessageEndLineNumber = isHeaderInFile ? lineNumbers.get(0) : -1;
for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
List<String> row = rows.get(index);
int lineNumber = lineNumbers.get(index);
Map<String, String> sampleRecord = new LinkedHashMap<>();
Util.filterListToMap(sampleRecord, headerWithNamedBlanks,
trimFields ? row.stream().map(String::trim).collect(Collectors.toList()) : row);
sampleRecords.add(sampleRecord);
sampleMessages.add(
sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index)).stream().collect(Collectors.joining("\n")));
prevMessageEndLineNumber = lineNumber;
}
String preamble = Pattern.compile("\n").splitAsStream(sample).limit(lineNumbers.get(1)).collect(Collectors.joining("\n", "", "\n"));
char delimiter = (char) csvPreference.getDelimiterChar();
LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.fromSeparator(delimiter))
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(preamble)
.setNumLinesAnalyzed(lineNumbers.get(lineNumbers.size() - 1))
.setNumMessagesAnalyzed(sampleRecords.size())
.setHasHeaderRow(isHeaderInFile)
.setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList()));
if (trimFields) {
structureBuilder.setShouldTrimFields(true);
}
Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
if (timeField != null) {
String timeLineRegex = null;
StringBuilder builder = new StringBuilder("^");
// We make the assumption that the timestamp will be on the first line of each record. Therefore, if the
// timestamp is the last column then either our assumption is wrong (and the approach will completely
// break down) or else every record is on a single line and there's no point creating a multiline config.
// This is why the loop excludes the last column.
for (String column : Arrays.asList(header).subList(0, header.length - 1)) {
if (timeField.v1().equals(column)) {
builder.append("\"?");
String simpleTimePattern = timeField.v2().simplePattern.pattern();
builder.append(simpleTimePattern.startsWith("\\b") ? simpleTimePattern.substring(2) : simpleTimePattern);
timeLineRegex = builder.toString();
break;
} else {
builder.append(".*?");
if (delimiter == '\t') {
builder.append("\\t");
} else {
builder.append(delimiter);
}
}
}
if (isHeaderInFile) {
structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
.map(column -> "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?")
.collect(Collectors.joining(",")));
}
structureBuilder.setTimestampField(timeField.v1())
.setTimestampFormats(timeField.v2().dateFormats)
.setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing())
.setMultilineStartPattern(timeLineRegex);
}
SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
LogStructure structure = structureBuilder
.setMappings(mappings)
.setExplanation(explanation)
.build();
return new SeparatedValuesLogStructureFinder(sampleMessages, structure);
}
private SeparatedValuesLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}
@Override
public List<String> getSampleMessages() {
return sampleMessages;
}
@Override
public LogStructure getStructure() {
return structure;
}
static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPreference csvPreference) throws IOException {
int fieldsInFirstRow = -1;
List<List<String>> rows = new ArrayList<>();
List<Integer> lineNumbers = new ArrayList<>();
try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
try {
List<String> row;
while ((row = csvReader.read()) != null) {
if (fieldsInFirstRow < 0) {
fieldsInFirstRow = row.size();
} else {
// Tolerate extra columns if and only if they're empty
while (row.size() > fieldsInFirstRow && row.get(row.size() - 1) == null) {
row.remove(row.size() - 1);
}
}
rows.add(row);
lineNumbers.add(csvReader.getLineNumber());
}
} catch (SuperCsvException e) {
// Tolerate an incomplete last row
if (notUnexpectedEndOfFile(e)) {
throw e;
}
}
}
assert rows.isEmpty() == false;
assert lineNumbers.size() == rows.size();
if (rows.get(0).size() != rows.get(rows.size() - 1).size()) {
rows.remove(rows.size() - 1);
lineNumbers.remove(lineNumbers.size() - 1);
}
// This should have been enforced by canCreateFromSample()
assert rows.size() > 1;
return new Tuple<>(rows, lineNumbers);
}
static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, List<List<String>> rows) {
assert rows.isEmpty() == false;
List<String> firstRow = rows.get(0);
boolean isHeaderInFile = true;
if (rowContainsDuplicateNonEmptyValues(firstRow)) {
isHeaderInFile = false;
explanation.add("First row contains duplicate values, so assuming it's not a header");
} else {
if (rows.size() < 3) {
explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
} else {
isHeaderInFile = isFirstRowUnusual(explanation, rows);
}
}
if (isHeaderInFile) {
// SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new));
} else {
return new Tuple<>(false, IntStream.rangeClosed(1, firstRow.size()).mapToObj(num -> "column" + num).toArray(String[]::new));
}
}
static boolean rowContainsDuplicateNonEmptyValues(List<String> row) {
HashSet<String> values = new HashSet<>();
for (String value : row) {
if (value != null && value.isEmpty() == false && values.add(value) == false) {
return true;
}
}
return false;
}
private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {
assert rows.size() >= 3;
List<String> firstRow = rows.get(0);
String firstRowStr = firstRow.stream().map(field -> (field == null) ? "" : field).collect(Collectors.joining(""));
List<List<String>> otherRows = rows.subList(1, rows.size());
List<String> otherRowStrs = new ArrayList<>();
for (List<String> row : otherRows) {
otherRowStrs.add(row.stream().map(str -> (str == null) ? "" : str).collect(Collectors.joining("")));
}
// Check lengths
double firstRowLength = firstRowStr.length();
DoubleSummaryStatistics otherRowStats = otherRowStrs.stream().mapToDouble(otherRow -> (double) otherRow.length())
.collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
double otherLengthRange = otherRowStats.getMax() - otherRowStats.getMin();
if (firstRowLength < otherRowStats.getMin() - otherLengthRange / 10.0 ||
firstRowLength > otherRowStats.getMax() + otherLengthRange / 10.0) {
explanation.add("First row is unusual based on length test: [" + firstRowLength + "] and [" +
toNiceString(otherRowStats) + "]");
return true;
}
explanation.add("First row is not unusual based on length test: [" + firstRowLength + "] and [" +
toNiceString(otherRowStats) + "]");
// Check edit distances
DoubleSummaryStatistics firstRowStats = otherRows.stream().limit(MAX_LEVENSHTEIN_COMPARISONS)
.mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow))
.collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
otherRowStats = new DoubleSummaryStatistics();
int numComparisons = 0;
int proportion = otherRowStrs.size() / MAX_LEVENSHTEIN_COMPARISONS;
int innerIncrement = 1 + proportion * proportion;
Random random = new Random(firstRow.hashCode());
for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS && j < otherRowStrs.size();
j += innerIncrement) {
otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j)));
++numComparisons;
}
}
if (firstRowStats.getAverage() > otherRowStats.getAverage() * 1.2) {
explanation.add("First row is unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
"] and [" + toNiceString(otherRowStats) + "]");
return true;
}
explanation.add("First row is not unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
"] and [" + toNiceString(otherRowStats) + "]");
return false;
}
private static String toNiceString(DoubleSummaryStatistics stats) {
return String.format(Locale.ROOT, "count=%d, min=%f, average=%f, max=%f", stats.getCount(), stats.getMin(), stats.getAverage(),
stats.getMax());
}
/**
* Sum of the Levenshtein distances between corresponding elements
* in the two supplied lists _excluding_ the biggest difference.
* The reason the biggest difference is excluded is that sometimes
* there's a "message" field that is much longer than any of the other
* fields, varies enormously between rows, and skews the comparison.
*/
static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow) {
int largestSize = Math.max(firstRow.size(), secondRow.size());
if (largestSize <= 1) {
return 0;
}
int[] distances = new int[largestSize];
for (int index = 0; index < largestSize; ++index) {
distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "",
(index < secondRow.size()) ? secondRow.get(index) : "");
}
Arrays.sort(distances);
return IntStream.of(distances).limit(distances.length - 1).sum();
}
/**
* This method implements the simple algorithm for calculating Levenshtein distance.
*/
static int levenshteinDistance(String first, String second) {
// There are some examples with pretty pictures of the matrix on Wikipedia here:
// http://en.wikipedia.org/wiki/Levenshtein_distance
int firstLen = (first == null) ? 0 : first.length();
int secondLen = (second == null) ? 0 : second.length();
if (firstLen == 0) {
return secondLen;
}
if (secondLen == 0) {
return firstLen;
}
int[] currentCol = new int[secondLen + 1];
int[] prevCol = new int[secondLen + 1];
// Populate the left column
for (int down = 0; down <= secondLen; ++down) {
currentCol[down] = down;
}
// Calculate the other entries in the matrix
for (int across = 1; across <= firstLen; ++across) {
int[] tmp = prevCol;
prevCol = currentCol;
// We could allocate a new array for currentCol here, but it's more efficient to reuse the one that's now redundant
currentCol = tmp;
currentCol[0] = across;
for (int down = 1; down <= secondLen; ++down) {
// Do the strings differ at the point we've reached?
if (first.charAt(across - 1) == second.charAt(down - 1)) {
// No, they're the same => no extra cost
currentCol[down] = prevCol[down - 1];
} else {
// Yes, they differ, so there are 3 options:
// 1) Deletion => cell to the left's value plus 1
int option1 = prevCol[down];
// 2) Insertion => cell above's value plus 1
int option2 = currentCol[down - 1];
// 3) Substitution => cell above left's value plus 1
int option3 = prevCol[down - 1];
// Take the cheapest option of the 3
currentCol[down] = Math.min(Math.min(option1, option2), option3) + 1;
}
}
}
// Result is the value in the bottom right hand corner of the matrix
return currentCol[secondLen];
}
static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) {
char quote = csvPreference.getQuoteChar();
String lineWithEscapedQuotesRemoved = line.replace(String.valueOf(quote) + quote, "");
for (int index = 1; index < lineWithEscapedQuotesRemoved.length() - 1; ++index) {
if (lineWithEscapedQuotesRemoved.charAt(index) == quote &&
lineWithEscapedQuotesRemoved.codePointAt(index - 1) != csvPreference.getDelimiterChar() &&
lineWithEscapedQuotesRemoved.codePointAt(index + 1) != csvPreference.getDelimiterChar()) {
return true;
}
}
return false;
}
static boolean canCreateFromSample(List<String> explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference,
String formatName) {
// Logstash's CSV parser won't tolerate fields where just part of the
// value is quoted, whereas SuperCSV will, hence this extra check
String[] sampleLines = sample.split("\n");
for (String sampleLine : sampleLines) {
if (lineHasUnescapedQuote(sampleLine, csvPreference)) {
explanation.add("Not " + formatName +
" because a line has an unescaped quote that is not at the beginning or end of a field: [" + sampleLine + "]");
return false;
}
}
try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
int fieldsInFirstRow = -1;
int fieldsInLastRow = -1;
int numberOfRows = 0;
try {
List<String> row;
while ((row = csvReader.read()) != null) {
int fieldsInThisRow = row.size();
++numberOfRows;
if (fieldsInFirstRow < 0) {
fieldsInFirstRow = fieldsInThisRow;
if (fieldsInFirstRow < minFieldsPerRow) {
explanation.add("Not " + formatName + " because the first row has fewer than [" + minFieldsPerRow +
"] fields: [" + fieldsInFirstRow + "]");
return false;
}
fieldsInLastRow = fieldsInFirstRow;
continue;
}
// Tolerate extra columns if and only if they're empty
while (fieldsInThisRow > fieldsInFirstRow && row.get(fieldsInThisRow - 1) == null) {
--fieldsInThisRow;
}
if (fieldsInLastRow != fieldsInFirstRow) {
explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) +
"] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" +
fieldsInLastRow + "]");
return false;
}
fieldsInLastRow = fieldsInThisRow;
}
if (fieldsInLastRow > fieldsInFirstRow) {
explanation.add("Not " + formatName + " because last row has more fields than first row: [" + fieldsInFirstRow +
"] and [" + fieldsInLastRow + "]");
return false;
}
if (fieldsInLastRow < fieldsInFirstRow) {
--numberOfRows;
}
} catch (SuperCsvException e) {
// Tolerate an incomplete last row
if (notUnexpectedEndOfFile(e)) {
explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
return false;
}
}
if (numberOfRows <= 1) {
explanation.add("Not " + formatName + " because fewer than 2 complete records in sample: [" + numberOfRows + "]");
return false;
}
explanation.add("Deciding sample is " + formatName);
return true;
} catch (IOException e) {
explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
return false;
}
}
private static boolean notUnexpectedEndOfFile(SuperCsvException e) {
return e.getMessage().startsWith("unexpected end of file while reading quoted column") == false;
}
}
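
Two small worked examples of the distance helpers above (a sketch, not part of the commit; it assumes the same package because the helpers are package-private).

package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.Arrays;

public class LevenshteinSketch {
    public static void main(String[] args) {
        // Classic example: kitten -> sitting needs two substitutions and one insertion
        System.out.println(SeparatedValuesLogStructureFinder.levenshteinDistance("kitten", "sitting")); // 3
        // The fieldwise comparison discards the single biggest per-field distance,
        // so one volatile "message" column cannot skew otherwise identical rows
        int rowDist = SeparatedValuesLogStructureFinder.levenshteinFieldwiseCompareRows(
            Arrays.asList("alpha", "beta", "short message"),
            Arrays.asList("alpha", "beta", "a much, much longer free-text message"));
        System.out.println(rowDist); // 0: per-field distances {0, 0, big}, biggest dropped
    }
}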


@ -0,0 +1,201 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;
public class TextLogStructureFinder implements LogStructureFinder {
private final List<String> sampleMessages;
private final LogStructure structure;
static TextLogStructureFinder makeTextLogStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker) {
String[] sampleLines = sample.split("\n");
Tuple<TimestampMatch, Set<String>> bestTimestamp = mostLikelyTimestamp(sampleLines);
if (bestTimestamp == null) {
// Is it appropriate to treat a file that is neither structured nor has
// a regular pattern of timestamps as a log file? Probably not...
throw new IllegalArgumentException("Could not find a timestamp in the log sample provided");
}
explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]");
List<String> sampleMessages = new ArrayList<>();
StringBuilder preamble = new StringBuilder();
int linesConsumed = 0;
StringBuilder message = null;
int linesInMessage = 0;
String multiLineRegex = createMultiLineMessageStartRegex(bestTimestamp.v2(), bestTimestamp.v1().simplePattern.pattern());
Pattern multiLinePattern = Pattern.compile(multiLineRegex);
for (String sampleLine : sampleLines) {
if (multiLinePattern.matcher(sampleLine).find()) {
if (message != null) {
sampleMessages.add(message.toString());
linesConsumed += linesInMessage;
}
message = new StringBuilder(sampleLine);
linesInMessage = 1;
} else {
// If message is null here then the sample probably began with the incomplete ending of a previous message
if (message == null) {
// We count lines before the first message as consumed (just like we would
// for the CSV header or lines before the first XML document starts)
++linesConsumed;
} else {
message.append('\n').append(sampleLine);
++linesInMessage;
}
}
if (sampleMessages.size() < 2) {
preamble.append(sampleLine).append('\n');
}
}
// Don't add the last message, as it might be partial and mess up subsequent pattern finding
LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.SEMI_STRUCTURED_TEXT)
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(preamble.toString())
.setNumLinesAnalyzed(linesConsumed)
.setNumMessagesAnalyzed(sampleMessages.size())
.setMultilineStartPattern(multiLineRegex);
SortedMap<String, Object> mappings = new TreeMap<>();
mappings.put("message", Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"));
mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
// We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
String interimTimestampField;
String grokPattern;
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern();
if (timestampFieldAndFullMatchGrokPattern != null) {
interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
grokPattern = timestampFieldAndFullMatchGrokPattern.v2();
} else {
interimTimestampField = "timestamp";
grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField);
}
LogStructure structure = structureBuilder
.setTimestampField(interimTimestampField)
.setTimestampFormats(bestTimestamp.v1().dateFormats)
.setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing())
.setGrokPattern(grokPattern)
.setMappings(mappings)
.setExplanation(explanation)
.build();
return new TextLogStructureFinder(sampleMessages, structure);
}
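// Illustrative walk-through (assumed sample, not from the original tests): given the
// three lines "2018-05-17T13:41:23 ERROR boom", "  at Foo.bar(Foo.java:1)" and
// "2018-05-17T13:41:24 INFO ok", the first two collapse into one multi-line message
// because only lines starting with a timestamp match multiLinePattern, and the final
// line is held back as a possibly-partial message, so sampleMessages gets one entry.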
private TextLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}
@Override
public List<String> getSampleMessages() {
return sampleMessages;
}
@Override
public LogStructure getStructure() {
return structure;
}
static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLines) {
Map<TimestampMatch, Tuple<Double, Set<String>>> timestampMatches = new LinkedHashMap<>();
int remainingLines = sampleLines.length;
double differenceBetweenTwoHighestWeights = 0.0;
for (String sampleLine : sampleLines) {
TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine);
if (match != null) {
TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
match.grokPatternName, "");
timestampMatches.compute(pureMatch, (k, v) -> {
if (v == null) {
return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface)));
} else {
v.v2().add(match.preface);
return new Tuple<>(v.v1() + weightForMatch(match.preface), v.v2());
}
});
differenceBetweenTwoHighestWeights = findDifferenceBetweenTwoHighestWeights(timestampMatches.values());
}
// The highest possible weight for a single line is 1, so once the difference between
// the two highest weights exceeds the number of lines remaining the leader cannot
// possibly be overtaken
if (differenceBetweenTwoHighestWeights > --remainingLines) {
break;
}
}
double highestWeight = 0.0;
Tuple<TimestampMatch, Set<String>> highestWeightMatch = null;
for (Map.Entry<TimestampMatch, Tuple<Double, Set<String>>> entry : timestampMatches.entrySet()) {
double weight = entry.getValue().v1();
if (weight > highestWeight) {
highestWeight = weight;
highestWeightMatch = new Tuple<>(entry.getKey(), entry.getValue().v2());
}
}
return highestWeightMatch;
}
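// Illustrative sketch of the early exit above (assumed numbers): weightForMatch never
// contributes more than 1 per line, so if the leading format is ahead of the runner-up
// by more than the number of unscanned lines, say a lead of 11 with 10 lines left, no
// outcome on the remaining lines can change the winner and the loop breaks.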
/**
* Used to weight a timestamp match according to how far along the line it is found.
* Timestamps at the very beginning of the line are given a weight of 1. The weight
* progressively decreases the more text there is preceding the timestamp match, but
* is always greater than 0.
* @return A weight in the range (0, 1].
*/
private static double weightForMatch(String preface) {
return Math.pow(1.0 + preface.length() / 15.0, -1.1);
}
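// Illustrative values (assumed, not from the original tests): an empty preface scores
// Math.pow(1.0, -1.1) == 1.0, a 15-character preface scores Math.pow(2.0, -1.1), about
// 0.47, and a 30-character preface scores Math.pow(3.0, -1.1), about 0.30.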
private static double findDifferenceBetweenTwoHighestWeights(Collection<Tuple<Double, Set<String>>> timestampMatches) {
double highestWeight = 0.0;
double secondHighestWeight = 0.0;
for (Tuple<Double, Set<String>> timestampMatch : timestampMatches) {
double weight = timestampMatch.v1();
if (weight > highestWeight) {
secondHighestWeight = highestWeight;
highestWeight = weight;
} else if (weight > secondHighestWeight) {
secondHighestWeight = weight;
}
}
return highestWeight - secondHighestWeight;
}
static String createMultiLineMessageStartRegex(Collection<String> prefaces, String timestampRegex) {
StringBuilder builder = new StringBuilder("^");
GrokPatternCreator.addIntermediateRegex(builder, prefaces);
builder.append(timestampRegex);
if (builder.substring(0, 3).equals("^\\b")) {
builder.delete(1, 3);
}
return builder.toString();
}
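// Illustrative sketch (assumed inputs and behaviour): if every sampled timestamp sat at
// the start of its line, the prefaces would all be empty, the intermediate regex would
// contribute nothing, and the result would be "^" + timestampRegex, with a leading
// "^\b" collapsed to "^" because a word boundary is redundant immediately after the
// start-of-line anchor.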
}

View File

@ -0,0 +1,39 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import java.util.List;
import java.util.regex.Pattern;
public class TextLogStructureFinderFactory implements LogStructureFinderFactory {
// This works because, by default, dot doesn't match newlines
private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+.");
/**
* This format matches if the sample contains at least one newline and at least two
* non-blank lines.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
if (sample.indexOf('\n') < 0) {
explanation.add("Not text because sample contains no newlines");
return false;
}
if (TWO_NON_BLANK_LINES_PATTERN.matcher(sample).find() == false) {
explanation.add("Not text because sample contains fewer than two non-blank lines");
return false;
}
explanation.add("Deciding sample is text");
return true;
}
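// Illustrative examples (assumed samples, not from the original tests): "no newline"
// fails the first check; "a\n\n" fails the second because dot does not match the
// newlines, so no two non-blank lines are found; "a\nb" passes both checks.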
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker) {
return TextLogStructureFinder.makeTextLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
}
}

View File

@ -0,0 +1,427 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.grok.Grok;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Used to find the best timestamp format for one of the following situations:
* 1. Matching an entire field value
* 2. Matching a timestamp found somewhere within a message
*/
public final class TimestampFormatFinder {
private static final String PREFACE = "preface";
private static final String EPILOGUE = "epilogue";
private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([:.,])(\\d{3,9})");
private static final char DEFAULT_FRACTIONAL_SECOND_SEPARATOR = ',';
/**
* The timestamp patterns are complex and it can be slow to prove they do not
* match anywhere in a long message. Many of the timestamps are similar and
* will never be found in a string if simpler sub-patterns do not exist in the
* string. These sub-patterns can be used to quickly rule out multiple complex
* patterns. These patterns do not need to represent quantities that are
* useful to know the value of, merely character sequences that can be used to
* prove that <em>several</em> more complex patterns cannot possibly match.
*/
private static final List<Pattern> QUICK_RULE_OUT_PATTERNS = Arrays.asList(
// YYYY-MM-dd followed by a space
Pattern.compile("\\b\\d{4}-\\d{2}-\\d{2} "),
// The end of some number (likely year or day) followed by a space then HH:mm
Pattern.compile("\\d \\d{2}:\\d{2}\\b"),
// HH:mm:ss surrounded by spaces
Pattern.compile(" \\d{2}:\\d{2}:\\d{2} ")
);
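// For example (illustrative): a line with no "dddd-dd-dd " date prefix fails quick
// rule-out pattern 0, which immediately eliminates every candidate below that lists
// index 0 in its quickRuleOutIndices without running their expensive Grok matchers.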
/**
* The first match in this list will be chosen, so it needs to be ordered
* such that more generic patterns come after more specific patterns.
*/
static final List<CandidateTimestampFormat> ORDERED_CANDIDATE_FORMATS = Arrays.asList(
// The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but
// with a space before the timezone, and because the timezone is optional in ISO8601 a Tomcat
// timestamp would be recognised as ISO8601 with the timezone dropped if ISO8601 were checked first
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
"\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
"TOMCAT_DATESTAMP", Arrays.asList(0, 1)),
// The Elasticsearch ISO8601 parser requires a literal T between the date and time, so
// longhand formats are needed if there's a space instead
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b",
"TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b",
"TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601",
Arrays.asList(0, 1)),
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601",
Arrays.asList(0, 1)),
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601",
Arrays.asList(0, 1)),
new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)\\b", "TIMESTAMP_ISO8601",
Arrays.asList(0, 1)),
new CandidateTimestampFormat("ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b",
"TIMESTAMP_ISO8601"),
new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
"\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ}\\b", "DATESTAMP_RFC822", Arrays.asList(1, 2)),
new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ",
"\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE} %{TZ}\\b", "DATESTAMP_RFC822", Collections.singletonList(1)),
new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ",
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
"\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}:%{MINUTE})\\b",
"DATESTAMP_RFC2822", Arrays.asList(1, 2)),
new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z",
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
"\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
"DATESTAMP_RFC2822", Arrays.asList(1, 2)),
new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
"\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822",
Collections.singletonList(1)),
new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
"\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822",
Collections.singletonList(1)),
new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
"\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER",
Arrays.asList(1, 2)),
new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
"\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE} %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Collections.singletonList(1)),
new CandidateTimestampFormat("YYYYMMddHHmmss", "\\b\\d{14}\\b",
"\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b",
"DATESTAMP_EVENTLOG"),
new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b",
"\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", Arrays.asList(1, 2)),
new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM d HH:mm:ss,SSS"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
"%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "SYSLOGTIMESTAMP",
Collections.singletonList(1)),
new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b",
"SYSLOGTIMESTAMP", Collections.singletonList(1)),
new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
"\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE"),
new CandidateTimestampFormat("MMM dd, YYYY K:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
"%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP"),
new CandidateTimestampFormat(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
"%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", Collections.singletonList(1)),
new CandidateTimestampFormat("UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"),
new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"),
new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"),
new CandidateTimestampFormat("TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM")
);
private TimestampFormatFinder() {
}
/**
* Find the first timestamp format that matches part of the supplied value.
* @param text The value that the returned timestamp format must exist within.
* @return The timestamp format, or <code>null</code> if none matches.
*/
public static TimestampMatch findFirstMatch(String text) {
return findFirstMatch(text, 0);
}
/**
* Find the first timestamp format that matches part of the supplied value,
* excluding a specified number of candidate formats.
* @param text The value that the returned timestamp format must exist within.
* @param ignoreCandidates The number of candidate formats to exclude from the search.
* @return The timestamp format, or <code>null</code> if none matches.
*/
public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) {
Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()];
int index = ignoreCandidates;
for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
boolean quicklyRuledOut = false;
for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) {
if (quickRuleoutMatches[quickRuleOutIndex] == null) {
quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find();
}
if (quickRuleoutMatches[quickRuleOutIndex] == false) {
quicklyRuledOut = true;
break;
}
}
if (quicklyRuledOut == false) {
Map<String, Object> captures = candidate.strictSearchGrok.captures(text);
if (captures != null) {
String preface = captures.getOrDefault(PREFACE, "").toString();
String epilogue = captures.getOrDefault(EPILOGUE, "").toString();
return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(),
text.length() - epilogue.length()), epilogue);
}
}
++index;
}
return null;
}
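// Usage sketch (assumed input and output, not from the original tests):
// TimestampMatch match = TimestampFormatFinder.findFirstMatch("[2018-05-17T13:41:23] ERROR");
// would be expected to yield preface "[", epilogue "] ERROR", grok pattern name
// "TIMESTAMP_ISO8601" and date formats containing "ISO8601".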
/**
* Find the best timestamp format for matching an entire field value.
* @param text The value that the returned timestamp format must match in its entirety.
* @return The timestamp format, or <code>null</code> if none matches.
*/
public static TimestampMatch findFirstFullMatch(String text) {
return findFirstFullMatch(text, 0);
}
/**
* Find the best timestamp format for matching an entire field value,
* excluding a specified number of candidate formats.
* @param text The value that the returned timestamp format must match in its entirety.
* @param ignoreCandidates The number of candidate formats to exclude from the search.
* @return The timestamp format, or <code>null</code> if none matches.
*/
public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) {
int index = ignoreCandidates;
for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
Map<String, Object> captures = candidate.strictFullMatchGrok.captures(text);
if (captures != null) {
return makeTimestampMatch(candidate, index, "", text, "");
}
++index;
}
return null;
}
private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosenTimestampFormat, int chosenIndex,
String preface, String matchedDate, String epilogue) {
Tuple<Character, Integer> fractionalSecondsInterpretation = interpretFractionalSeconds(matchedDate);
List<String> dateFormats = chosenTimestampFormat.dateFormats;
Pattern simplePattern = chosenTimestampFormat.simplePattern;
char separator = fractionalSecondsInterpretation.v1();
if (separator != DEFAULT_FRACTIONAL_SECOND_SEPARATOR) {
dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator))
.collect(Collectors.toList());
if (dateFormats.stream().noneMatch(dateFormat -> dateFormat.startsWith("UNIX"))) {
String patternStr = simplePattern.pattern();
int separatorPos = patternStr.lastIndexOf(DEFAULT_FRACTIONAL_SECOND_SEPARATOR);
if (separatorPos >= 0) {
StringBuilder newPatternStr = new StringBuilder(patternStr);
newPatternStr.replace(separatorPos, separatorPos + 1, ((separator == '.') ? "\\" : "") + separator);
simplePattern = Pattern.compile(newPatternStr.toString());
}
}
}
int numberOfDigitsInFractionalComponent = fractionalSecondsInterpretation.v2();
if (numberOfDigitsInFractionalComponent > 3) {
String fractionalSecondsFormat = "SSSSSSSSS".substring(0, numberOfDigitsInFractionalComponent);
dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace("SSS", fractionalSecondsFormat))
.collect(Collectors.toList());
}
return new TimestampMatch(chosenIndex, preface, dateFormats, simplePattern, chosenTimestampFormat.standardGrokPatternName,
epilogue);
}
/**
* Interpret the fractional seconds component of a date to determine two things:
* 1. The separator character - one of colon, comma, or dot.
* 2. The number of digits in the fractional component.
* @param date The textual representation of the date for which fractional seconds are to be interpreted.
* @return A tuple of (fractional second separator character, number of digits in fractional component).
*/
static Tuple<Character, Integer> interpretFractionalSeconds(String date) {
Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(date);
if (matcher.find()) {
return new Tuple<>(matcher.group(1).charAt(0), matcher.group(2).length());
}
return new Tuple<>(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, 0);
}
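// Illustrative examples (assumed inputs): "2018-05-17 13:41:23.123456" yields ('.', 6),
// while "13:41:23" yields the defaults (',', 0) because no run of three or more digits
// follows a separator character.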
/**
* Represents a timestamp that has matched a field value or been found within a message.
*/
public static final class TimestampMatch {
/**
* The index of the corresponding entry in the <code>ORDERED_CANDIDATE_FORMATS</code> list.
*/
public final int candidateIndex;
/**
* Text that came before the timestamp in the matched field/message.
*/
public final String preface;
/**
* Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers.
*/
public final List<String> dateFormats;
/**
* A simple regex that will work in many languages to detect whether the timestamp format
* exists in a particular line.
*/
public final Pattern simplePattern;
/**
* Name of an out-of-the-box Grok pattern that will match the timestamp.
*/
public final String grokPatternName;
/**
* Text that came after the timestamp in the matched field/message.
*/
public final String epilogue;
TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue) {
this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
}
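// Note: the hasFractionalComponentSmallerThanMillisecond flag below is currently unused;
// this constructor delegates identically to the one above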
TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue,
boolean hasFractionalComponentSmallerThanMillisecond) {
this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
}
TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, String simpleRegex, String grokPatternName,
String epilogue) {
this(candidateIndex, preface, dateFormats, Pattern.compile(simpleRegex), grokPatternName, epilogue);
}
TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, Pattern simplePattern, String grokPatternName,
String epilogue) {
this.candidateIndex = candidateIndex;
this.preface = preface;
this.dateFormats = dateFormats;
this.simplePattern = simplePattern;
this.grokPatternName = grokPatternName;
this.epilogue = epilogue;
}
/**
* Does parsing the timestamp produce different results depending on the timezone of the parser?
* I.e., does the textual representation NOT define the timezone?
*/
public boolean hasTimezoneDependentParsing() {
return dateFormats.stream()
.anyMatch(dateFormat -> dateFormat.contains("HH") && dateFormat.toLowerCase(Locale.ROOT).indexOf('z') == -1);
}
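// Illustrative examples (assumed formats): "YYYY-MM-dd HH:mm:ss" is timezone-dependent
// (contains "HH" but no 'z' or 'Z'), whereas "YYYY-MM-dd HH:mm:ssZ" is not, and a
// date-only format like "YYYY-MM-dd" is not because it contains no "HH".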
/**
* Sometimes Elasticsearch mappings for dates need to include the format.
* This method returns appropriate mapping settings: at minimum "type"="date",
* and possibly also a "format" setting.
*/
public Map<String, String> getEsDateMappingTypeWithFormat() {
if (dateFormats.contains("TAI64N")) {
// There's no format for TAI64N in the date formats used in mappings
return Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
}
Map<String, String> mapping = new LinkedHashMap<>();
mapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
String formats = dateFormats.stream().flatMap(format -> {
switch (format) {
case "ISO8601":
return Stream.empty();
case "UNIX_MS":
return Stream.of("epoch_millis");
case "UNIX":
return Stream.of("epoch_second");
default:
return Stream.of(format);
}
}).collect(Collectors.joining("||"));
if (formats.isEmpty() == false) {
mapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, formats);
}
return mapping;
}
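// Illustrative examples (assumed values, not from the original tests): ["ISO8601"]
// maps to just "type"="date" since it contributes no explicit format, ["UNIX_MS"]
// maps to "type"="date" with "format"="epoch_millis", and ["TAI64N"] falls back
// to "type"="keyword".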
@Override
public int hashCode() {
return Objects.hash(candidateIndex, preface, dateFormats, simplePattern.pattern(), grokPatternName, epilogue);
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other == null || getClass() != other.getClass()) {
return false;
}
TimestampMatch that = (TimestampMatch) other;
return this.candidateIndex == that.candidateIndex &&
Objects.equals(this.preface, that.preface) &&
Objects.equals(this.dateFormats, that.dateFormats) &&
Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) &&
Objects.equals(this.grokPatternName, that.grokPatternName) &&
Objects.equals(this.epilogue, that.epilogue);
}
@Override
public String toString() {
return "index = " + candidateIndex + (preface.isEmpty() ? "" : ", preface = '" + preface + "'") +
", date formats = " + dateFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) +
", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" +
(epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'");
}
}
static final class CandidateTimestampFormat {
final List<String> dateFormats;
final Pattern simplePattern;
final Grok strictSearchGrok;
final Grok strictFullMatchGrok;
final String standardGrokPatternName;
final List<Integer> quickRuleOutIndices;
CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName);
}
CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
List<Integer> quickRuleOutIndices) {
this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName, quickRuleOutIndices);
}
CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
this(dateFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, Collections.emptyList());
}
CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
List<Integer> quickRuleOutIndices) {
this.dateFormats = dateFormats;
this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE);
// The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern +
"%{GREEDYDATA:" + EPILOGUE + "}");
this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern);
this.standardGrokPatternName = standardGrokPatternName;
assert quickRuleOutIndices.stream()
.noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size());
this.quickRuleOutIndices = quickRuleOutIndices;
}
}
}

View File

@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.supercsv.prefs.CsvPreference;
import java.io.IOException;
import java.util.List;
public class TsvLogStructureFinderFactory implements LogStructureFinderFactory {
/**
* Rules are:
* - The file must be valid TSV
* - It must contain at least two complete records
* - There must be at least two fields per record (otherwise files with no tabs could be treated as TSV!)
* - Every TSV record except the last must have the same number of fields
* The reason the last record is allowed to have fewer fields than the others is that
* it could have been truncated when the file was sampled.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.TAB_PREFERENCE, "TSV");
}
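// Illustrative sketch (assumed samples): "name\tcount\nfoo\t1\nbar\t2\n" would be
// accepted (two complete records, two fields each), whereas "foo\nbar\n" would be
// rejected because each record has only one field.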
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
CsvPreference.TAB_PREFERENCE, false);
}
}

View File

@ -0,0 +1,172 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;
public class XmlLogStructureFinder implements LogStructureFinder {
private final List<String> sampleMessages;
private final LogStructure structure;
static XmlLogStructureFinder makeXmlLogStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker)
throws IOException, ParserConfigurationException, SAXException {
String messagePrefix;
try (Scanner scanner = new Scanner(sample)) {
messagePrefix = scanner.next();
}
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
docBuilderFactory.setNamespaceAware(false);
docBuilderFactory.setValidating(false);
List<String> sampleMessages = new ArrayList<>();
List<Map<String, ?>> sampleRecords = new ArrayList<>();
String[] sampleDocEnds = sample.split(Pattern.quote(messagePrefix));
StringBuilder preamble = new StringBuilder(sampleDocEnds[0]);
int linesConsumed = numNewlinesIn(sampleDocEnds[0]);
for (int i = 1; i < sampleDocEnds.length; ++i) {
String sampleDoc = messagePrefix + sampleDocEnds[i];
if (i < 3) {
preamble.append(sampleDoc);
}
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
try (InputStream is = new ByteArrayInputStream(sampleDoc.getBytes(StandardCharsets.UTF_8))) {
sampleRecords.add(docToMap(docBuilder.parse(is)));
sampleMessages.add(sampleDoc);
linesConsumed += numNewlinesIn(sampleDoc);
} catch (SAXException e) {
// Tolerate an incomplete last record as long as we have one complete record
if (sampleRecords.isEmpty() || i < sampleDocEnds.length - 1) {
throw e;
}
}
}
if (sample.endsWith("\n") == false) {
++linesConsumed;
}
// If we get here the XML parser should have confirmed this
assert messagePrefix.charAt(0) == '<';
String topLevelTag = messagePrefix.substring(1);
LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.XML)
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(preamble.toString())
.setNumLinesAnalyzed(linesConsumed)
.setNumMessagesAnalyzed(sampleRecords.size())
.setMultilineStartPattern("^\\s*<" + topLevelTag);
Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
if (timeField != null) {
structureBuilder.setTimestampField(timeField.v1())
.setTimestampFormats(timeField.v2().dateFormats)
.setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
}
SortedMap<String, Object> innerMappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
secondLevelProperties.put(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
secondLevelProperties.put(LogStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
SortedMap<String, Object> outerMappings = new TreeMap<>();
outerMappings.put(topLevelTag, secondLevelProperties);
outerMappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD,
Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
LogStructure structure = structureBuilder
.setMappings(outerMappings)
.setExplanation(explanation)
.build();
return new XmlLogStructureFinder(sampleMessages, structure);
}
private XmlLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}
@Override
public List<String> getSampleMessages() {
return sampleMessages;
}
@Override
public LogStructure getStructure() {
return structure;
}
private static int numNewlinesIn(String str) {
return (int) str.chars().filter(c -> c == '\n').count();
}
private static Map<String, Object> docToMap(Document doc) {
Map<String, Object> docAsMap = new LinkedHashMap<>();
doc.getDocumentElement().normalize();
addNodeToMap(doc.getDocumentElement(), docAsMap);
return docAsMap;
}
private static void addNodeToMap(Node node, Map<String, Object> nodeAsMap) {
NamedNodeMap attributes = node.getAttributes();
for (int i = 0; i < attributes.getLength(); ++i) {
Node attribute = attributes.item(i);
nodeAsMap.put(attribute.getNodeName(), attribute.getNodeValue());
}
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); ++i) {
Node child = children.item(i);
if (child.getNodeType() == Node.ELEMENT_NODE) {
if (child.getChildNodes().getLength() == 1) {
Node grandChild = child.getChildNodes().item(0);
String value = grandChild.getNodeValue().trim();
if (value.isEmpty() == false) {
nodeAsMap.put(child.getNodeName(), value);
}
} else {
Map<String, Object> childNodeAsMap = new LinkedHashMap<>();
addNodeToMap(child, childNodeAsMap);
if (childNodeAsMap.isEmpty() == false) {
nodeAsMap.put(child.getNodeName(), childNodeAsMap);
}
}
}
}
}
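// Illustrative example (assumed document, not from the original tests): parsing
// <log level="INFO"><message>hello</message><empty/></log> would produce
// {level=INFO, message=hello}; attributes become map entries, elements with a single
// text child collapse to their trimmed text, and nodes that yield nothing are dropped.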
}

View File

@ -0,0 +1,122 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.Location;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
public class XmlLogStructureFinderFactory implements LogStructureFinderFactory {
private final XMLInputFactory xmlFactory;
public XmlLogStructureFinderFactory() {
xmlFactory = XMLInputFactory.newInstance();
xmlFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
}
/**
* This format matches if the sample consists of one or more XML documents,
* all with the same root element name. If there is more than one document,
* only whitespace is allowed in between them. The last one does not
* necessarily have to be complete (as the sample could have truncated it).
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
int completeDocCount = 0;
String commonRootElementName = null;
String remainder = sample.trim();
boolean mightBeAnotherDocument = !remainder.isEmpty();
// This processing is complicated because it's necessary to create a
// new XML stream reader per document, but each reader reads ahead and
// may consume characters from the following document. We must
// therefore also recreate the string reader for each document.
while (mightBeAnotherDocument) {
try (Reader reader = new StringReader(remainder)) {
XMLStreamReader xmlReader = xmlFactory.createXMLStreamReader(reader);
try {
int nestingLevel = 0;
while ((mightBeAnotherDocument = xmlReader.hasNext())) {
switch (xmlReader.next()) {
case XMLStreamReader.START_ELEMENT:
if (nestingLevel++ == 0) {
String rootElementName = xmlReader.getLocalName();
if (commonRootElementName == null) {
commonRootElementName = rootElementName;
} else if (commonRootElementName.equals(rootElementName) == false) {
explanation.add("Not XML because different documents have different root " +
"element names: [" + commonRootElementName + "] and [" + rootElementName + "]");
return false;
}
}
break;
case XMLStreamReader.END_ELEMENT:
if (--nestingLevel < 0) {
explanation.add("Not XML because an end element occurs before a start element");
return false;
}
break;
}
if (nestingLevel == 0) {
++completeDocCount;
// Find the position that's one character beyond the end of the end element.
// The next document (if there is one) must start after this (possibly
// preceded by whitespace).
Location location = xmlReader.getLocation();
int endPos = 0;
// Line and column numbers start at 1, not 0
for (int wholeLines = location.getLineNumber() - 1; wholeLines > 0; --wholeLines) {
endPos = remainder.indexOf('\n', endPos) + 1;
if (endPos == 0) {
explanation.add("Not XML because XML parser location is inconsistent: line [" +
location.getLineNumber() + "], column [" + location.getColumnNumber() + "] in [" + remainder + "]");
return false;
}
}
endPos += location.getColumnNumber() - 1;
remainder = remainder.substring(endPos).trim();
mightBeAnotherDocument = !remainder.isEmpty();
break;
}
}
} finally {
xmlReader.close();
}
} catch (IOException | XMLStreamException e) {
explanation.add("Not XML because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
return false;
}
}
if (completeDocCount == 0) {
explanation.add("Not XML because sample didn't contain a complete document");
return false;
}
explanation.add("Deciding sample is XML");
return true;
}
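// Illustrative examples (assumed samples, not from the original tests):
// "<log>a</log>\n<log>b</log>" would be accepted; "<log>a</log><event>b</event>" would
// be rejected because the root element names differ; "<log>a</log>junk" would be
// rejected because the trailing text triggers a parsing exception.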
@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException, ParserConfigurationException, SAXException {
return XmlLogStructureFinder.makeXmlLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
}
}

View File

@ -0,0 +1,38 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
public class CsvLogStructureFinderFactoryTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
// No need to check JSON or XML because they come earlier in the order we check formats
public void testCanCreateFromSampleGivenCsv() {
assertTrue(factory.canCreateFromSample(explanation, CSV_SAMPLE));
}
public void testCanCreateFromSampleGivenTsv() {
assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
}
public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenPipeSeparatedValues() {
assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenText() {
assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
}

View File

@ -0,0 +1,326 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.GrokPatternCreator.ValueOnlyGrokPatternCandidate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import static org.hamcrest.Matchers.containsInAnyOrder;
public class GrokPatternCreatorTests extends LogStructureTestCase {
public void testBuildFieldName() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("extra_timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri"));
assertEquals("extra_timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
}
public void testPopulatePrefacesAndEpiloguesGivenTimestamp() {
Collection<String> matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
"[2018-01-24T12:33:23] ERROR ",
"junk [2018-01-22T07:33:23] INFO ",
"[2018-01-21T03:33:23] DEBUG ");
ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
}
public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() {
Collection<String> matchingStrings = Arrays.asList("before alice@acme.com after",
"abc bob@acme.com xyz",
"carol@acme.com");
ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
}
public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
Collection<String> snippets = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
"[2018-01-24T12:33:23] ERROR ",
"junk [2018-01-22T07:33:23] INFO ",
"[2018-01-21T03:33:23] DEBUG ");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ",
grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
Collection<String> snippets = Arrays.asList("(-2)",
" (-3)",
" (4)",
" (-5) ");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() {
Collection<String> snippets = Arrays.asList("before-2 ",
"prior to-3",
"-4");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
// It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers
assertEquals(".*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
Collection<String> snippets = Arrays.asList(" abc",
" 123",
" -123",
"1f is hex");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
Collection<String> snippets = Arrays.asList("<host1.1.p2ps:",
"<host2.1.p2ps:");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
// We don't want the .1. in the middle to get detected as a hex number
assertEquals("<.*?:", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
Collection<String> snippets = Arrays.asList("before alice@acme.com after",
"abc bob@acme.com xyz",
"carol@acme.com");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*?%{EMAILADDRESS:email}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenUris() {
Collection<String> snippets = Arrays.asList("main site https://www.elastic.co/ with trailing slash",
"https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
"download today from https://www.elastic.co/downloads");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*?%{URI:uri}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenPaths() {
Collection<String> snippets = Arrays.asList("on Mac /Users/dave",
"on Windows C:\\Users\\dave",
"on Linux /home/dave");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*? .*? %{PATH:path}", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testAppendBestGrokMatchForStringsGivenKvPairs() {
Collection<String> snippets = Arrays.asList("foo=1 and bar=a",
"something foo=2 bar=b something else",
"foo=3 bar=c",
" foo=1 bar=a ");
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
assertEquals(".*?\\bfoo=%{USER:foo} .*?\\bbar=%{USER:bar}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
}
public void testCreateGrokPatternFromExamplesGivenNamedLogs() {
Collection<String> sampleMessages = Arrays.asList(
"Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
"Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
"Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
"Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
Map<String, Object> mappings = new HashMap<>();
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " +
"%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}",
grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", "timestamp"));
assertEquals(5, mappings.size());
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("field2"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field3"));
}
public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() {
Collection<String> sampleMessages = Arrays.asList(
"Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.");
Map<String, Object> mappings = new HashMap<>();
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*",
grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp"));
assertEquals(1, mappings.size());
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
}
public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() {
// Two timestamps: one local, one UTC
Collection<String> sampleMessages = Arrays.asList(
"559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp");
Map<String, Object> mappings = new HashMap<>();
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" +
"%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*",
grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp"));
assertEquals(5, mappings.size());
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"),
mappings.get("extra_timestamp"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
}
public void testFindFullLineGrokPatternGivenApacheCombinedLogs() {
Collection<String> sampleMessages = Arrays.asList(
"83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
"\"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1\" 200 203023 " +
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
"83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
"\"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1\" 200 7697 " +
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
"83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
"\"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1\" 200 26185 " +
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
"83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
"\"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1\" 200 430406 " +
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"");
Map<String, Object> mappings = new HashMap<>();
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern());
assertEquals(10, mappings.size());
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bytes"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("clientip"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double"), mappings.get("httpversion"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("ident"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("referrer"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("request"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("response"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("verb"));
}
public void testAdjustForPunctuationGivenCommonPrefix() {
Collection<String> snippets = Arrays.asList(
"\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.212\",\"No-lookup\",\"192.168.33.132\"," +
"\"80\",\"46721\",\"/Common/Subnet_33\",\"TCP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
"\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.143.244\",\"No-lookup\",\"192.168.33.106\"," +
"\"55025\",\"162\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
"\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.3\",\"No-lookup\",\"224.0.0.102\"," +
"\"3222\",\"3222\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\""
);
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString());
assertNotNull(adjustedSnippets);
assertThat(new ArrayList<>(adjustedSnippets),
containsInAnyOrder(snippets.stream().map(snippet -> snippet.substring(2)).toArray(String[]::new)));
}
public void testAdjustForPunctuationGivenNoCommonPrefix() {
Collection<String> snippets = Arrays.asList(
"|client (id:2) was removed from servergroup 'Normal'(id:7) by client 'User1'(id:2)",
"|servergroup 'GAME'(id:9) was added by 'User1'(id:2)",
"|permission 'i_group_auto_update_type'(id:146) with values (value:30, negated:0, skipchannel:0) " +
"was added by 'User1'(id:2) to servergroup 'GAME'(id:9)"
);
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());
assertSame(snippets, adjustedSnippets);
}
}

View File

@ -0,0 +1,46 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
public class JsonLogStructureFinderFactoryTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
public void testCanCreateFromSampleGivenJson() {
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
}
public void testCanCreateFromSampleGivenXml() {
assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE));
}
public void testCanCreateFromSampleGivenCsv() {
assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
}
public void testCanCreateFromSampleGivenTsv() {
assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
}
public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenPipeSeparatedValues() {
assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenText() {
assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
}

View File

@ -0,0 +1,39 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import java.util.Collections;
public class JsonLogStructureFinderTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
public void testCreateConfigsGivenGoodJson() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.JSON, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertNull(structure.getExcludeLinesPattern());
assertNull(structure.getMultilineStartPattern());
assertNull(structure.getSeparator());
assertNull(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertNull(structure.getGrokPattern());
assertEquals("timestamp", structure.getTimestampField());
assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
}
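    // For reference: the JSON_SAMPLE messages carry "timestamp":1478261151445,
    // i.e. milliseconds since the epoch, which is why the deduced timestamp
    // format above is UNIX_MS rather than a pattern-based format.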
}

View File

@ -0,0 +1,72 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import com.ibm.icu.text.CharsetMatch;
import java.io.ByteArrayInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import static org.hamcrest.Matchers.startsWith;
import static org.hamcrest.core.IsInstanceOf.instanceOf;
public class LogStructureFinderManagerTests extends LogStructureTestCase {
private LogStructureFinderManager structureFinderManager = new LogStructureFinderManager();
public void testFindCharsetGivenCharacterWidths() throws Exception {
for (Charset charset : Arrays.asList(StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE)) {
CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation,
new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)));
assertEquals(charset.name(), charsetMatch.getName());
}
}
public void testFindCharsetGivenBinary() throws Exception {
// This input should never match a single-byte character set. ICU4J will sometimes decide
// that it matches a double-byte character set, hence the two outcomes asserted below.
int size = 1000;
byte[] binaryBytes = randomByteArrayOfLength(size);
for (int i = 0; i < 10; ++i) {
binaryBytes[randomIntBetween(0, size - 1)] = 0;
}
try {
CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes));
assertThat(charsetMatch.getName(), startsWith("UTF-16"));
} catch (IllegalArgumentException e) {
assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage());
}
}
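    // For reference, the detection exercised by the two tests above can be
    // reproduced in isolation with the ICU4J API this library depends on
    // (com.ibm.icu.text.CharsetDetector); the filtering to Filebeat-supported
    // encodings that findCharset() layers on top is omitted here (illustrative
    // sketch, not the method's implementation):
    //
    //     CharsetDetector detector = new CharsetDetector();
    //     detector.setText(new ByteArrayInputStream(TEXT_SAMPLE.getBytes(StandardCharsets.UTF_8)));
    //     CharsetMatch charsetMatch = detector.detect(); // best guess, e.g. "UTF-8"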
public void testMakeBestStructureGivenJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation,
"{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()),
instanceOf(JsonLogStructureFinder.class));
}
public void testMakeBestStructureGivenXml() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation,
"<log time=\"2018-05-17T13:41:23\"><message>hello</message></log>", StandardCharsets.UTF_8.name(), randomBoolean()),
instanceOf(XmlLogStructureFinder.class));
}
public void testMakeBestStructureGivenCsv() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" +
"2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()),
instanceOf(SeparatedValuesLogStructureFinder.class));
}
public void testMakeBestStructureGivenText() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" +
"[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()),
instanceOf(TextLogStructureFinder.class));
}
}

View File

@ -0,0 +1,86 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.test.ESTestCase;
import org.junit.After;
import org.junit.Before;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
public abstract class LogStructureTestCase extends ESTestCase {
protected static final List<String> POSSIBLE_CHARSETS = Collections.unmodifiableList(Charset.availableCharsets().keySet().stream()
.filter(name -> LogStructureFinderManager.FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT)))
.collect(Collectors.toList()));
protected static final String CSV_SAMPLE = "time,id,value\n" +
"2018-05-17T16:23:40,key1,42.0\n" +
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";
protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +
"\"level\":\"INFO\",\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 2\",\"class\":\"ml\"," +
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n";
protected static final String PIPE_SEPARATED_VALUES_SAMPLE = "2018-01-06 16:56:14.295748|INFO |VirtualServer |1 |" +
"listening on 0.0.0.0:9987, :::9987\n" +
"2018-01-06 17:19:44.465252|INFO |VirtualServer |1 |client " +
"'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)\n" +
"2018-01-06 17:21:25.764368|INFO |VirtualServer |1 |client " +
"'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel 'Default Channel'(id:1)";
protected static final String SEMI_COLON_SEPARATED_VALUES_SAMPLE = "\"pos_id\";\"trip_id\";\"latitude\";\"longitude\";\"altitude\";" +
"\"timestamp\"\n" +
"\"1\";\"3\";\"4703.7815\";\"1527.4713\";\"359.9\";\"2017-01-19 16:19:04.742113\"\n" +
"\"2\";\"3\";\"4703.7815\";\"1527.4714\";\"359.9\";\"2017-01-19 16:19:05.741890\"\n" +
"\"3\";\"3\";\"4703.7816\";\"1527.4716\";\"360.3\";\"2017-01-19 16:19:06.738842\"";
protected static final String TEXT_SAMPLE = "[2018-05-11T17:07:29,461][INFO ][o.e.n.Node ] [node-0] initializing ...\n" +
"[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
"net usable_space [223.4gb], net total_space [464.7gb], types [hfs]\n" +
"[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], " +
"compressed ordinary object pointers [true]\n" +
"[2018-05-11T17:07:29,556][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [tJ9u8HcaTbWxRtnlfz1RQA]\n";
protected static final String TSV_SAMPLE = "time\tid\tvalue\n" +
"2018-05-17T16:23:40\tkey1\t42.0\n" +
"2018-05-17T16:24:11\t\"key with spaces\"\t42.0\n";
protected static final String XML_SAMPLE = "<log4j:event logger=\"autodetect\" timestamp=\"1526574809521\" level=\"ERROR\" " +
"thread=\"0x7fffc5a7c3c0\">\n" +
"<log4j:message><![CDATA[Neither a fieldname clause nor a field config file was specified]]></log4j:message>\n" +
"</log4j:event>\n" +
"\n" +
"<log4j:event logger=\"autodetect\" timestamp=\"1526574809522\" level=\"FATAL\" thread=\"0x7fffc5a7c3c0\">\n" +
"<log4j:message><![CDATA[Field config could not be interpreted]]></log4j:message>\n" +
"</log4j:event>\n" +
"\n";
protected List<String> explanation;
@Before
public void initExplanation() {
explanation = new ArrayList<>();
}
@After
public void printExplanation() {
Loggers.getLogger(getClass()).info("Explanation:\n" + String.join("\n", explanation));
}
protected Boolean randomHasByteOrderMarker(String charset) {
return charset.toUpperCase(Locale.ROOT).startsWith("UTF") ? randomBoolean() : null;
}
}

View File

@ -0,0 +1,83 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
public class LogStructureTests extends AbstractXContentTestCase<LogStructure> {
@Override
protected LogStructure createTestInstance() {
LogStructure.Format format = randomFrom(EnumSet.allOf(LogStructure.Format.class));
LogStructure.Builder builder = new LogStructure.Builder(format);
int numLinesAnalyzed = randomIntBetween(2, 10000);
builder.setNumLinesAnalyzed(numLinesAnalyzed);
int numMessagesAnalyzed = randomIntBetween(1, numLinesAnalyzed);
builder.setNumMessagesAnalyzed(numMessagesAnalyzed);
builder.setSampleStart(randomAlphaOfLength(1000));
String charset = randomFrom(Charset.availableCharsets().keySet());
builder.setCharset(charset);
if (charset.toUpperCase(Locale.ROOT).startsWith("UTF")) {
builder.setHasByteOrderMarker(randomBoolean());
}
if (numMessagesAnalyzed < numLinesAnalyzed) {
builder.setMultilineStartPattern(randomAlphaOfLength(100));
}
if (randomBoolean()) {
builder.setExcludeLinesPattern(randomAlphaOfLength(100));
}
if (format.isSeparatedValues() || (format.supportsNesting() && randomBoolean())) {
builder.setInputFields(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
}
if (format.isSeparatedValues()) {
builder.setHasHeaderRow(randomBoolean());
if (rarely()) {
builder.setSeparator(format.separator());
}
}
if (format.isSemiStructured()) {
builder.setGrokPattern(randomAlphaOfLength(100));
}
if (format.isSemiStructured() || randomBoolean()) {
builder.setTimestampField(randomAlphaOfLength(10));
builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
builder.setNeedClientTimezone(randomBoolean());
}
Map<String, Object> mappings = new TreeMap<>();
for (String field : generateRandomStringArray(5, 20, false, false)) {
mappings.put(field, Collections.singletonMap(randomAlphaOfLength(5), randomAlphaOfLength(10)));
}
builder.setMappings(mappings);
builder.setExplanation(Arrays.asList(generateRandomStringArray(10, 150, false, false)));
return builder.build();
}
@Override
protected LogStructure doParseInstance(XContentParser parser) {
return LogStructure.PARSER.apply(parser, null).build();
}
@Override
protected boolean supportsUnknownFields() {
return false;
}
}

View File

@ -0,0 +1,292 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import static org.hamcrest.Matchers.contains;
public class LogStructureUtilsTests extends LogStructureTestCase {
public void testMoreLikelyGivenText() {
assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword("the quick brown fox jumped over the lazy dog"));
assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(257, 10000)));
}
public void testMoreLikelyGivenKeyword() {
assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("1"));
assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("DEBUG"));
assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256)));
}
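    // Taken together, the two tests above suggest the text-vs-keyword decision:
    // values longer than 256 characters, or sentence-like values containing
    // spaces, are judged "more likely text", while short single tokens map to
    // "keyword". This is inferred from the cases exercised here, not a statement
    // of the exact heuristic.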
public void testSingleSampleSingleField() {
Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
assertNotNull(match);
assertEquals("field1", match.v1());
assertThat(match.v2().dateFormats, contains("ISO8601"));
assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
}
public void testSamplesWithSameSingleTimeField() {
Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406");
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNotNull(match);
assertEquals("field1", match.v1());
assertThat(match.v2().dateFormats, contains("ISO8601"));
assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
}
public void testSamplesWithOneSingleTimeFieldDifferentFormat() {
Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406");
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNull(match);
}
public void testSamplesWithDifferentSingleTimeField() {
Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
Map<String, String> sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406");
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNull(match);
}
public void testSingleSampleManyFieldsOneTimeFormat() {
Map<String, Object> sample = new LinkedHashMap<>();
sample.put("foo", "not a time");
sample.put("time", "2018-05-24 17:28:31,735");
sample.put("bar", 42);
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
assertNotNull(match);
assertEquals("time", match.v1());
assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
}
public void testSamplesWithManyFieldsSameSingleTimeFormat() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("foo", "not a time");
sample1.put("time", "2018-05-24 17:28:31,735");
sample1.put("bar", 42);
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("foo", "whatever");
sample2.put("time", "2018-05-29 11:53:02,837");
sample2.put("bar", 17);
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNotNull(match);
assertEquals("time", match.v1());
assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
}
public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("foo", "not a time");
sample1.put("time", "2018-05-24 17:28:31,735");
sample1.put("bar", 42);
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("foo", "whatever");
sample2.put("time", "May 29 2018 11:53:02");
sample2.put("bar", 17);
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNull(match);
}
public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("red_herring", "May 29 2007 11:53:02");
sample1.put("time", "2018-05-24 17:28:31,735");
sample1.put("bar", 42);
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("red_herring", "whatever");
sample2.put("time", "2018-05-29 11:53:02,837");
sample2.put("bar", 17);
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNotNull(match);
assertEquals("time", match.v1());
assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
}
public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("foo", "not a time");
sample1.put("time", "May 24 2018 17:28:31");
sample1.put("red_herring", "2018-05-24 17:28:31,735");
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("foo", "whatever");
sample2.put("time", "May 29 2018 11:53:02");
sample2.put("red_herring", "17");
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNotNull(match);
assertEquals("time", match.v1());
assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"));
assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
}
public void testSamplesWithManyFieldsInconsistentTimeFields() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("foo", "not a time");
sample1.put("time1", "May 24 2018 17:28:31");
sample1.put("bar", 17);
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("foo", "whatever");
sample2.put("time2", "May 29 2018 11:53:02");
sample2.put("bar", 42);
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNull(match);
}
public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("foo", "not a time");
sample1.put("time1", "2018-05-09 17:28:31,735");
sample1.put("time2", "May 9 2018 17:28:31");
sample1.put("bar", 17);
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("foo", "whatever");
sample2.put("time2", "May 10 2018 11:53:02");
sample2.put("time3", "Thu, May 10 2018 11:53:02");
sample2.put("bar", 42);
Tuple<String, TimestampMatch> match =
LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
assertNotNull(match);
assertEquals("time2", match.v1());
assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"));
assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
}
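    // A compact restatement of what the timestamp-guessing tests above establish
    // (pseudocode sketch inferred from the assertions, not the library's code):
    //
    //     for each field name that appears in the samples:
    //         if one TimestampMatch format parses that field's value in every sample:
    //             return Tuple.tuple(fieldName, match);
    //     return null; // no field is consistent across all samples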
public void testGuessMappingGivenNothing() {
assertNull(LogStructureUtils.guessMapping(explanation, "foo", Collections.emptyList()));
}
public void testGuessMappingGivenKeyword() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
}
public void testGuessMappingGivenText() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text");
assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
}
public void testGuessMappingGivenIp() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip");
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
}
public void testGuessMappingGivenDouble() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double");
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
// 12345678901234567890 overflows a signed 64-bit long, so these values must map to double
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("1", "2", "12345678901234567890")));
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(3.14159265359, 0.0, 1e-308)));
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("-1e-1", "-1e308", "1e-308")));
}
public void testGuessMappingGivenLong() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
}
public void testGuessMappingGivenDate() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
}
public void testGuessMappingGivenBoolean() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "boolean");
assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("false", "true")));
assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(true, false)));
}
public void testGuessMappingGivenArray() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
assertEquals(expected,
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
}
public void testGuessMappingGivenObject() {
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
}
public void testGuessMappingGivenObjectAndNonObject() {
RuntimeException e = expectThrows(RuntimeException.class, () -> LogStructureUtils.guessMapping(explanation,
"foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2")));
assertEquals("Field [foo] has both object and non-object values - this is not supported by Elasticsearch", e.getMessage());
}
public void testGuessMappings() {
Map<String, Object> sample1 = new LinkedHashMap<>();
sample1.put("foo", "not a time");
sample1.put("time", "2018-05-24 17:28:31,735");
sample1.put("bar", 42);
sample1.put("nothing", null);
Map<String, Object> sample2 = new LinkedHashMap<>();
sample2.put("foo", "whatever");
sample2.put("time", "2018-05-29 11:53:02,837");
sample2.put("bar", 17);
sample2.put("nothing", null);
Map<String, Object> mappings = LogStructureUtils.guessMappings(explanation, Arrays.asList(sample1, sample2));
assertNotNull(mappings);
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
Map<String, String> expectedTimeMapping = new HashMap<>();
expectedTimeMapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
expectedTimeMapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, "YYYY-MM-dd HH:mm:ss,SSS");
assertEquals(expectedTimeMapping, mappings.get("time"));
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
assertNull(mappings.get("nothing"));
}
}

View File

@ -0,0 +1,23 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
public class PipeSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new PipeSeparatedValuesLogStructureFinderFactory();
// No need to check JSON, XML, CSV, TSV or semi-colon separated values because they come earlier in the order we check formats
public void testCanCreateFromSampleGivenPipeSeparatedValues() {
assertTrue(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenText() {
assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
}

View File

@ -0,0 +1,28 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
public class SemiColonSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new SemiColonSeparatedValuesLogStructureFinderFactory();
// No need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
assertTrue(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenPipeSeparatedValues() {
assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
}
public void testCanCreateFromSampleGivenText() {
assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
}

View File

@ -0,0 +1,293 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.supercsv.prefs.CsvPreference;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinFieldwiseCompareRows;
import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinDistance;
import static org.hamcrest.Matchers.arrayContaining;
public class SeparatedValuesLogStructureFinderTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
public void testCreateConfigsGivenCompleteCsv() throws Exception {
String sample = "time,message\n" +
"2018-05-17T13:41:23,hello\n" +
"2018-05-17T13:41:32,hello again\n";
assertTrue(factory.canCreateFromSample(explanation, sample));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.CSV, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
assertEquals(Character.valueOf(','), structure.getSeparator());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("time", "message"), structure.getInputFields());
assertNull(structure.getGrokPattern());
assertEquals("time", structure.getTimestampField());
assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
}
public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
String sample = "message,time,count\n" +
"\"hello\n" +
"world\",2018-05-17T13:41:23,1\n" +
"\"hello again\n"; // note that this last record is truncated
assertTrue(factory.canCreateFromSample(explanation, sample));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.CSV, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
assertEquals(Character.valueOf(','), structure.getSeparator());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("message", "time", "count"), structure.getInputFields());
assertNull(structure.getGrokPattern());
assertEquals("time", structure.getTimestampField());
assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
}
public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception {
String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
"store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
"improvement_surcharge,total_amount,,\n" +
"2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
"1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
"1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
assertTrue(factory.canCreateFromSample(explanation, sample));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.CSV, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
"\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
structure.getExcludeLinesPattern());
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
assertEquals(Character.valueOf(','), structure.getSeparator());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
"RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
"tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getInputFields());
assertNull(structure.getGrokPattern());
assertEquals("tpep_pickup_datetime", structure.getTimestampField());
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
}
public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception {
String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
"store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
"improvement_surcharge,total_amount\n" +
"2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
"1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
"1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
assertTrue(factory.canCreateFromSample(explanation, sample));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.CSV, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
"\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
structure.getExcludeLinesPattern());
assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
assertEquals(Character.valueOf(','), structure.getSeparator());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
"RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
"tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getInputFields());
assertNull(structure.getGrokPattern());
assertEquals("tpep_pickup_datetime", structure.getTimestampField());
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
}
public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
"\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
"\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n";
assertTrue(factory.canCreateFromSample(explanation, sample));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.CSV, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertEquals("^\"?pos_id\"?,\"?trip_id\"?,\"?latitude\"?,\"?longitude\"?,\"?altitude\"?,\"?timestamp\"?",
structure.getExcludeLinesPattern());
assertNull(structure.getMultilineStartPattern());
assertEquals(Character.valueOf(','), structure.getSeparator());
assertTrue(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getInputFields());
assertNull(structure.getGrokPattern());
assertEquals("timestamp", structure.getTimestampField());
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats());
}
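    // Note the contrast across the CSV tests above: the multi-line start pattern
    // anchors on the timestamp column - "^\"?\\d{4}..." when it is the first
    // column, "^.*?,\"?\\d{4}..." when a column precedes it - and is null in this
    // last test, where the timestamp is the final column (an observation from the
    // assertions; the implementation's rationale is not shown here).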
public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException {
String withHeader = "time,airline,responsetime,sourcetype\n" +
"2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
"2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
"2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
"2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
SeparatedValuesLogStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1());
assertTrue(header.v1());
assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype"));
}
public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException {
String withoutHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
"2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
"2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
"2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
SeparatedValuesLogStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1());
assertFalse(header.v1());
assertThat(header.v2(), arrayContaining("column1", "column2", "column3", "column4"));
}
public void testLevenshteinDistance() {
assertEquals(0, levenshteinDistance("cat", "cat"));
assertEquals(3, levenshteinDistance("cat", "dog"));
assertEquals(5, levenshteinDistance("cat", "mouse"));
assertEquals(3, levenshteinDistance("cat", ""));
assertEquals(3, levenshteinDistance("dog", "cat"));
assertEquals(0, levenshteinDistance("dog", "dog"));
assertEquals(4, levenshteinDistance("dog", "mouse"));
assertEquals(3, levenshteinDistance("dog", ""));
assertEquals(5, levenshteinDistance("mouse", "cat"));
assertEquals(4, levenshteinDistance("mouse", "dog"));
assertEquals(0, levenshteinDistance("mouse", "mouse"));
assertEquals(5, levenshteinDistance("mouse", ""));
assertEquals(3, levenshteinDistance("", "cat"));
assertEquals(3, levenshteinDistance("", "dog"));
assertEquals(5, levenshteinDistance("", "mouse"));
assertEquals(0, levenshteinDistance("", ""));
}
public void testLevenshteinCompareRows() {
assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog")));
assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat")));
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat")));
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat")));
assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat")));
assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse")));
assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog")));
}
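    // The expected values above are consistent with summing the per-field
    // Levenshtein distances and discarding the single largest, so that one wildly
    // different field (typically a free-text message column) does not dominate
    // the row comparison. That reading is an inference from the asserted numbers,
    // not the library's code; a minimal sketch under that assumption, for rows of
    // equal length as in these tests:
    private static int levenshteinFieldwiseCompareRowsSketch(java.util.List<String> firstRow, java.util.List<String> secondRow) {
        int[] distances = new int[firstRow.size()];
        for (int index = 0; index < firstRow.size(); ++index) {
            distances[index] = levenshteinDistance(firstRow.get(index), secondRow.get(index));
        }
        Arrays.sort(distances);
        // Sum everything except the largest distance.
        return java.util.stream.IntStream.of(distances).limit(Math.max(0, distances.length - 1)).sum();
    }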
public void testLineHasUnescapedQuote() {
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b\",c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b,c\"", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\",c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\"\"\",c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"\"\"", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\",b,c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,\"\"b\",c", CsvPreference.EXCEL_PREFERENCE));
assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words,b,c", CsvPreference.EXCEL_PREFERENCE));
assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\",b,c", CsvPreference.EXCEL_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\"\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\tc\"", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\"\"\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"\"\"", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\"\tb\tc", CsvPreference.TAB_PREFERENCE));
assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\t\"\"b\"\tc", CsvPreference.TAB_PREFERENCE));
assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words\tb\tc", CsvPreference.TAB_PREFERENCE));
assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\"\tb\tc", CsvPreference.TAB_PREFERENCE));
}
public void testRowContainsDuplicateNonEmptyValues() {
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a")));
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("")));
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
}
}

View File

@ -0,0 +1,19 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
public class TextLogStructureFinderFactoryTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();
// No need to check JSON, XML, CSV, TSV, semi-colon separated values or pipe
// separated values because they come earlier in the order we check formats
public void testCanCreateFromSampleGivenText() {
assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
}
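// The "order we check formats" remarks in this and the preceding factory tests
// assume the manager tries the factories in a fixed sequence - JSON, XML, CSV,
// TSV, semi-colon separated values, pipe separated values, then plain text - and
// uses the first whose canCreateFromSample() accepts the sample, roughly:
//
//     for (LogStructureFinderFactory candidate : ORDERED_FACTORIES) {
//         if (candidate.canCreateFromSample(explanation, sample)) {
//             return candidate.createFromSample(explanation, sample, charset, hasByteOrderMarker);
//         }
//     }
//
// (ORDERED_FACTORIES is an illustrative name; only the relative ordering is
// implied by these tests.)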

View File

@ -0,0 +1,245 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import java.util.Collections;
import java.util.Set;
public class TextLogStructureFinderTests extends LogStructureTestCase {
private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();
public void testCreateConfigsGivenElasticsearchLog() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
LogStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker);
LogStructure structure = structureFinder.getStructure();
assertEquals(LogStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}
assertNull(structure.getExcludeLinesPattern());
assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
assertNull(structure.getSeparator());
assertNull(structure.getHasHeaderRow());
assertNull(structure.getShouldTrimFields());
assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
assertEquals("timestamp", structure.getTimestampField());
assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
}
public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() {
for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.emptySet(), simpleDateRegex));
}
}
public void testCreateMultiLineMessageStartRegexGivenOneEmptyPreface() {
for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton(""), simpleDateRegex));
}
}
public void testCreateMultiLineMessageStartRegexGivenOneLogLevelPreface() {
for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton("[ERROR] ["), simpleDateRegex));
}
}
public void testCreateMultiLineMessageStartRegexGivenManyLogLevelPrefaces() {
for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
Set<String> prefaces = Sets.newHashSet("[ERROR] [", "[DEBUG] [");
String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
}
}
public void testCreateMultiLineMessageStartRegexGivenManyHostnamePrefaces() {
for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
Set<String> prefaces = Sets.newHashSet("host-1.acme.com|", "my_host.elastic.co|");
String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
assertEquals("^.*?\\|" + simpleDateRegex,
TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
}
}
public void testCreateMultiLineMessageStartRegexGivenManyPrefacesIncludingEmpty() {
for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
Set<String> prefaces = Sets.newHashSet("", "[non-standard] ");
String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
assertEquals("^.*?" + simpleDateRegex,
TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
}
}
public void testMostLikelyTimestampGivenAllSame() {
String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node ] [node-0] initializing ...\n" +
"[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
"net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" +
"[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], " +
"compressed ordinary object pointers [true]\n" +
"[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" +
"[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " +
"build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " +
"JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" +
"[2018-06-27T11:59:22,205][INFO ][o.e.n.Node ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " +
"-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " +
"-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " +
"-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " +
"-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " +
"-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " +
"-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " +
"-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " +
"-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " +
"-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " +
"-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " +
"-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" +
"[2018-06-27T11:59:22,205][WARN ][o.e.n.Node ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " +
"Elasticsearch and is not suitable for production\n" +
"[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService ] [node-0] loaded module [aggs-matrix-stats]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [analysis-common]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [ingest-common]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-expression]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-mustache]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-painless]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [mapper-extras]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [parent-join]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [percolator]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [rank-eval]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [reindex]\n" +
"[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [repository-url]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [transport-netty4]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-core]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-deprecation]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-graph]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-logstash]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-ml]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-monitoring]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-rollup]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-security]\n" +
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-sql]\n" +
"[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-upgrade]\n" +
"[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" +
"[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n";
Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
assertNotNull(mostLikelyMatch);
assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
mostLikelyMatch.v1());
}
public void testMostLikelyTimestampGivenExceptionTrace() {
String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " +
"(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" +
"java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " +
"encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " +
"such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " +
"111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " +
"in length; got 49023\n" +
"\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " +
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
"\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " +
"~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " +
"~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " +
"~[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
"(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
"(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
"(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
"(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
"(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
"(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" +
"(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" +
"(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
"[elasticsearch-6.2.1.jar:6.2.1]\n" +
"\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" +
"\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" +
"\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n";
Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
assertNotNull(mostLikelyMatch);
        // Even though many lines have a timestamp near the end (in the Lucene version information),
        // those matches occur so far along their lines that the heavily weighted timestamp near the
        // beginning of the first line should take precedence
assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
mostLikelyMatch.v1());
}
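
    // Illustrative sketch, not part of the original change: a two-line miniature of the scenario
    // above, assuming the position weighting behaves as described. The sample text is hypothetical.
    public void testMostLikelyTimestampGivenEarlyTimestampOutweighsLaterOne() {
        String sample = "2018-05-15T16:14:56,374Z a message that is continued on the next line\n" +
            "continuation line with a version stamp much further along: built 2018-01-10 00:48:43 on ubuntu\n";
        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
        assertNotNull(mostLikelyMatch);
        // The ISO8601 timestamp at the very start of the first line is assumed to win over the
        // space-separated timestamp buried deep in the second line
        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
            mostLikelyMatch.v1());
    }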
}

View File

@ -0,0 +1,242 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

import java.util.Arrays;
import java.util.Locale;

public class TimestampFormatFinderTests extends LogStructureTestCase {

    public void testFindFirstMatchGivenNoMatch() {
assertNull(TimestampFormatFinder.findFirstMatch(""));
assertNull(TimestampFormatFinder.findFirstMatch("no timestamps in here"));
assertNull(TimestampFormatFinder.findFirstMatch(":::"));
assertNull(TimestampFormatFinder.findFirstMatch("/+"));
    }

public void testFindFirstMatchGivenOnlyIso8601() {
TimestampMatch expected = new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
"");
checkAndValidateDateFormat(expected, "2018-05-15T16:14:56,374Z", 1526400896374L);
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+0100", 1526400896374L);
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+01:00", 1526400896374L);
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374", 1526400896374L);
checkAndValidateDateFormat(expected, "2018-05-15T16:14:56Z", 1526400896000L);
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+0100", 1526400896000L);
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+01:00", 1526400896000L);
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56,374Z",
1526400896374L);
checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+0100",
1526400896374L);
checkAndValidateDateFormat(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+01:00",
1526400896374L);
checkAndValidateDateFormat(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374", 1526400896374L);
checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56Z", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+0100", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+01:00", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56", 1526400896000L);
    }

public void testFindFirstMatchGivenOnlyKnownDateFormat() {
// Note: some of the time formats give millisecond accuracy, some second accuracy and some minute accuracy
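        // (e.g. the minute-accuracy cases below parse to 1526400840000L, i.e. 16:14:00 UTC, dropping the 56 seconds)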
checkAndValidateDateFormat(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z",
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TOMCAT_DATESTAMP", ""), "2018-05-15 17:14:56,374 +0100",
1526400896374L);
checkAndValidateDateFormat(new TimestampMatch(8, "", "EEE MMM dd YYYY HH:mm:ss zzz",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
"Tue May 15 2018 16:14:56 UTC", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(9, "", "EEE MMM dd YYYY HH:mm zzz",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
"Tue May 15 2018 16:14 UTC", 1526400840000L);
checkAndValidateDateFormat(new TimestampMatch(10, "", "EEE, dd MMM YYYY HH:mm:ss ZZ",
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
"Tue, 15 May 2018 17:14:56 +01:00", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(11, "", "EEE, dd MMM YYYY HH:mm:ss Z",
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
"Tue, 15 May 2018 17:14:56 +0100", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(12, "", "EEE, dd MMM YYYY HH:mm ZZ",
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
"Tue, 15 May 2018 17:14 +01:00", 1526400840000L);
checkAndValidateDateFormat(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm Z",
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +0100",
1526400840000L);
checkAndValidateDateFormat(new TimestampMatch(14, "", "EEE MMM dd HH:mm:ss zzz YYYY",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
"Tue May 15 16:14:56 UTC 2018", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(15, "", "EEE MMM dd HH:mm zzz YYYY",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
"Tue May 15 16:14 UTC 2018", 1526400840000L);
checkAndValidateDateFormat(new TimestampMatch(16, "", "YYYYMMddHHmmss", "\\b\\d{14}\\b", "DATESTAMP_EVENTLOG", ""),
"20180515171456", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss YYYY",
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "HTTPDERROR_DATE", ""),
"Tue May 15 17:14:56 2018", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(18, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56.725", 1526400896725L);
checkAndValidateDateFormat(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(20, "", "dd/MMM/YYYY:HH:mm:ss Z",
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", ""), "15/May/2018:17:14:56 +0100", 1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
"\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", ""), "May 15, 2018 5:14:56 PM",
1526400896000L);
checkAndValidateDateFormat(new TimestampMatch(22, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "CISCOTIMESTAMP", ""), "May 15 2018 17:14:56",
1526400896000L);
    }

public void testFindFirstMatchGivenOnlySystemDate() {
assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
TimestampFormatFinder.findFirstMatch("1526400896374"));
assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
TimestampFormatFinder.findFirstFullMatch("1526400896374"));
assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
TimestampFormatFinder.findFirstMatch("1526400896.736"));
assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
TimestampFormatFinder.findFirstFullMatch("1526400896.736"));
assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
TimestampFormatFinder.findFirstMatch("1526400896"));
assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
TimestampFormatFinder.findFirstFullMatch("1526400896"));
assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
TimestampFormatFinder.findFirstMatch("400000005afb159a164ac980"));
assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
TimestampFormatFinder.findFirstFullMatch("400000005afb159a164ac980"));
    }

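    // Illustrative sketch, not asserted in the original change: the presumed contract is that
    // findFirstMatch accepts a timestamp embedded in surrounding text, while findFirstFullMatch
    // requires the entire string to be the timestamp.
    public void testFindFirstFullMatchRejectsEmbeddedTimestamp() {
        assertNotNull(TimestampFormatFinder.findFirstMatch("took 1526400896374 ms"));
        assertNull(TimestampFormatFinder.findFirstFullMatch("took 1526400896374 ms"));
    }
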
private void checkAndValidateDateFormat(TimestampMatch expected, String text, long expectedEpochMs) {
assertEquals(expected, TimestampFormatFinder.findFirstMatch(text));
assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text));
// All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London
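        // (1526400896000L is the epoch millisecond value of 2018-05-15T16:14:56Z)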
DateTimeZone zone = DateTimeZone.forID("Europe/London");
DateTime parsed;
for (int i = 0; i < expected.dateFormats.size(); ++i) {
try {
String dateFormat = expected.dateFormats.get(i);
switch (dateFormat) {
case "ISO8601":
parsed = ISODateTimeFormat.dateTimeParser().withZone(zone).withDefaultYear(2018).parseDateTime(text);
break;
default:
DateTimeFormatter parser = DateTimeFormat.forPattern(dateFormat).withZone(zone).withLocale(Locale.UK);
parsed = parser.withDefaultYear(2018).parseDateTime(text);
break;
}
if (expectedEpochMs == parsed.getMillis()) {
break;
}
// If the last one isn't right then propagate
if (i == expected.dateFormats.size() - 1) {
assertEquals(expectedEpochMs, parsed.getMillis());
}
} catch (RuntimeException e) {
// If the last one throws then propagate
if (i == expected.dateFormats.size() - 1) {
throw e;
}
}
}
assertTrue(expected.simplePattern.matcher(text).find());
    }

public void testFindFirstMatchGivenRealLogMessages() {
assertEquals(new TimestampMatch(7, "[", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
"][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"),
TimestampFormatFinder.findFirstMatch("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " +
"heap size [3.9gb], compressed ordinary object pointers [true]"));
assertEquals(new TimestampMatch(20, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z",
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE",
"] \"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"),
TimestampFormatFinder.findFirstMatch("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " +
"\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"));
assertEquals(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
"\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP",
" org.apache.tomcat.util.http.Parameters processParameters"),
TimestampFormatFinder.findFirstMatch("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters"));
assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " esxi1.acme.com Vpxa: " +
"[3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"),
TimestampFormatFinder.findFirstMatch("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
"opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"));
assertEquals(new TimestampMatch(7, "559550912540598297\t", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}",
"TIMESTAMP_ISO8601",
"\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"),
TimestampFormatFinder.findFirstMatch("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
"192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"));
assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP",
" dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53"),
TimestampFormatFinder.findFirstMatch("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
"'www.elastic.co/A/IN': 95.110.68.206#53"));
assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}",
"TIMESTAMP_ISO8601",
"|INFO |VirtualServer |1 |client 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " +
"'User1'(id:2) in channel '3er Instanz'(id:2)"),
TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " +
" 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)"));
    }

public void testInterpretFractionalSeconds() {
assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("Sep 8 11:55:35"));
assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("29/Jun/2016:12:11:31 +0000"));
assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368"));
assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438"));
assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764"));
assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764"));
assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368Z"));
assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438Z"));
assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764Z"));
assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764Z"));
assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368 Z"));
assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438 Z"));
assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764 Z"));
assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764 Z"));
}
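
    // Illustrative sketch (an assumption, not from the original change): the (separator, digit count)
    // tuple is presumably what turns a second-accuracy format into one like the
    // "YYYY-MM-dd HH:mm:ss.SSSSSS" seen in testFindFirstMatchGivenRealLogMessages.
    public void testFractionalSecondsTupleImpliesFormatSuffix() {
        Tuple<Character, Integer> fraction = TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 19:22:20.106822");
        // Build the hypothetical fractional-seconds suffix from the tuple
        String suffix = (fraction.v2() > 0) ? fraction.v1() + "SSSSSSSSS".substring(0, fraction.v2()) : "";
        assertEquals(".SSSSSS", suffix);
    }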
}

View File

@ -0,0 +1,33 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

public class TsvLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new TsvLogStructureFinderFactory();

// No need to check JSON, XML or CSV because they come earlier in the order we check formats
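    // (Illustrative assumption, not stated in this file: the presumed overall order is JSON ->
    // XML -> CSV -> TSV -> semicolon-separated -> pipe-separated -> text, so the tests below
    // check that this factory does not greedily claim samples belonging to the later formats.)
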
    public void testCanCreateFromSampleGivenTsv() {
        assertTrue(factory.canCreateFromSample(explanation, TSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {
        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}

View File

@ -0,0 +1,43 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

public class XmlLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();

    // No need to check JSON because it comes earlier in the order we check formats

    public void testCanCreateFromSampleGivenXml() {
        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
    }

    public void testCanCreateFromSampleGivenCsv() {
        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenTsv() {
        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {
        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}

View File

@ -0,0 +1,39 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.Collections;

public class XmlLogStructureFinderTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();

    public void testCreateConfigsGivenGoodXml() throws Exception {
        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.XML, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertNull(structure.getExcludeLinesPattern());
        assertEquals("^\\s*<log4j:event", structure.getMultilineStartPattern());
        assertNull(structure.getSeparator());
        assertNull(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertNull(structure.getGrokPattern());
        assertEquals("timestamp", structure.getTimestampField());
        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
    }
}