[ML] Add log structure finder functionality (#32788)
This change adds a library to ML that can be used to deduce a log file's structure given only a sample of the log file. Eventually this will be used to add an endpoint to ML to make the functionality available to end users, but this will follow in a separate change. The functionality is split into a library so that it can also be used by a command line tool without requiring the command line tool to include all server code.
This commit is contained in:
parent 986c55b830
commit 5ba04e23fc
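
As a minimal sketch of the intended usage (a hypothetical caller, not part of this commit; the sample text, charset argument and wrapping class are illustrative, while canCreateFromSample, createFromSample and getStructure are the methods defined in the diffs below):

package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.ArrayList;
import java.util.List;

public class LogStructureFinderExample {

    public static void main(String[] args) throws Exception {
        // Two ND-JSON messages standing in for a sample read from a real log file
        String sample = "{\"ts\":\"2018-08-10T12:00:00Z\",\"msg\":\"hello\"}\n" +
            "{\"ts\":\"2018-08-10T12:00:01Z\",\"msg\":\"world\"}\n";

        // Accumulates the reasons for each decision the finder makes
        List<String> explanation = new ArrayList<>();

        LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
        if (factory.canCreateFromSample(explanation, sample)) {
            // The charset and byte order marker flag would come from an earlier detection step
            LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", false);
            // structure now describes the sample: format, mappings, timestamp field, etc.
            LogStructure structure = finder.getStructure();
            explanation.forEach(System.out::println);
        }
    }
}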
@@ -0,0 +1,36 @@
import org.elasticsearch.gradle.precommit.PrecommitTasks

apply plugin: 'elasticsearch.build'

archivesBaseName = 'x-pack-log-structure-finder'

description = 'Common code for reverse engineering log structure'

dependencies {
    compile "org.elasticsearch:elasticsearch-core:${version}"
    compile "org.elasticsearch:elasticsearch-x-content:${version}"
    compile project(':libs:grok')
    compile "com.ibm.icu:icu4j:${versions.icu4j}"
    compile "net.sf.supercsv:super-csv:${versions.supercsv}"

    testCompile "org.elasticsearch.test:framework:${version}"
}

configurations {
    testArtifacts.extendsFrom testRuntime
}
task testJar(type: Jar) {
    appendix 'test'
    from sourceSets.test.output
}
artifacts {
    // normal es plugins do not publish the jar but we need to since users need it for Transport Clients and extensions
    archives jar
    testArtifacts testJar
}

forbiddenApisMain {
    // log-structure-finder does not depend on server, so cannot forbid server methods
    signaturesURLs = [PrecommitTasks.getResource('/forbidden/jdk-signatures.txt')]
}
@@ -0,0 +1 @@
7a4d00d5ec5febd252a6182e8b6e87a0a9821f81
@@ -0,0 +1,33 @@
ICU License - ICU 1.8.1 and later

COPYRIGHT AND PERMISSION NOTICE

Copyright (c) 1995-2012 International Business Machines Corporation and others

All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do so,
provided that the above copyright notice(s) and this permission notice appear
in all copies of the Software and that both the above copyright notice(s) and
this permission notice appear in supporting documentation.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall not
be used in advertising or otherwise to promote the sale, use or other
dealings in this Software without prior written authorization of the
copyright holder.

All trademarks and registered trademarks mentioned herein are the property of
their respective owners.
@@ -0,0 +1,3 @@
ICU4J, (under lucene/analysis/icu) is licensed under an MIT style license
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
International Business Machines Corporation and others
@@ -0,0 +1 @@
017f8708c929029dde48bc298deaf3c7ae2452d3
@@ -0,0 +1,203 @@
/*
 * Apache License
 * Version 2.0, January 2004
 * http://www.apache.org/licenses/
 *
 * TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 *
 * 1. Definitions.
 *
 * "License" shall mean the terms and conditions for use, reproduction,
 * and distribution as defined by Sections 1 through 9 of this document.
 *
 * "Licensor" shall mean the copyright owner or entity authorized by
 * the copyright owner that is granting the License.
 *
 * "Legal Entity" shall mean the union of the acting entity and all
 * other entities that control, are controlled by, or are under common
 * control with that entity. For the purposes of this definition,
 * "control" means (i) the power, direct or indirect, to cause the
 * direction or management of such entity, whether by contract or
 * otherwise, or (ii) ownership of fifty percent (50%) or more of the
 * outstanding shares, or (iii) beneficial ownership of such entity.
 *
 * "You" (or "Your") shall mean an individual or Legal Entity
 * exercising permissions granted by this License.
 *
 * "Source" form shall mean the preferred form for making modifications,
 * including but not limited to software source code, documentation
 * source, and configuration files.
 *
 * "Object" form shall mean any form resulting from mechanical
 * transformation or translation of a Source form, including but
 * not limited to compiled object code, generated documentation,
 * and conversions to other media types.
 *
 * "Work" shall mean the work of authorship, whether in Source or
 * Object form, made available under the License, as indicated by a
 * copyright notice that is included in or attached to the work
 * (an example is provided in the Appendix below).
 *
 * "Derivative Works" shall mean any work, whether in Source or Object
 * form, that is based on (or derived from) the Work and for which the
 * editorial revisions, annotations, elaborations, or other modifications
 * represent, as a whole, an original work of authorship. For the purposes
 * of this License, Derivative Works shall not include works that remain
 * separable from, or merely link (or bind by name) to the interfaces of,
 * the Work and Derivative Works thereof.
 *
 * "Contribution" shall mean any work of authorship, including
 * the original version of the Work and any modifications or additions
 * to that Work or Derivative Works thereof, that is intentionally
 * submitted to Licensor for inclusion in the Work by the copyright owner
 * or by an individual or Legal Entity authorized to submit on behalf of
 * the copyright owner. For the purposes of this definition, "submitted"
 * means any form of electronic, verbal, or written communication sent
 * to the Licensor or its representatives, including but not limited to
 * communication on electronic mailing lists, source code control systems,
 * and issue tracking systems that are managed by, or on behalf of, the
 * Licensor for the purpose of discussing and improving the Work, but
 * excluding communication that is conspicuously marked or otherwise
 * designated in writing by the copyright owner as "Not a Contribution."
 *
 * "Contributor" shall mean Licensor and any individual or Legal Entity
 * on behalf of whom a Contribution has been received by Licensor and
 * subsequently incorporated within the Work.
 *
 * 2. Grant of Copyright License. Subject to the terms and conditions of
 * this License, each Contributor hereby grants to You a perpetual,
 * worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 * copyright license to reproduce, prepare Derivative Works of,
 * publicly display, publicly perform, sublicense, and distribute the
 * Work and such Derivative Works in Source or Object form.
 *
 * 3. Grant of Patent License. Subject to the terms and conditions of
 * this License, each Contributor hereby grants to You a perpetual,
 * worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 * (except as stated in this section) patent license to make, have made,
 * use, offer to sell, sell, import, and otherwise transfer the Work,
 * where such license applies only to those patent claims licensable
 * by such Contributor that are necessarily infringed by their
 * Contribution(s) alone or by combination of their Contribution(s)
 * with the Work to which such Contribution(s) was submitted. If You
 * institute patent litigation against any entity (including a
 * cross-claim or counterclaim in a lawsuit) alleging that the Work
 * or a Contribution incorporated within the Work constitutes direct
 * or contributory patent infringement, then any patent licenses
 * granted to You under this License for that Work shall terminate
 * as of the date such litigation is filed.
 *
 * 4. Redistribution. You may reproduce and distribute copies of the
 * Work or Derivative Works thereof in any medium, with or without
 * modifications, and in Source or Object form, provided that You
 * meet the following conditions:
 *
 * (a) You must give any other recipients of the Work or
 * Derivative Works a copy of this License; and
 *
 * (b) You must cause any modified files to carry prominent notices
 * stating that You changed the files; and
 *
 * (c) You must retain, in the Source form of any Derivative Works
 * that You distribute, all copyright, patent, trademark, and
 * attribution notices from the Source form of the Work,
 * excluding those notices that do not pertain to any part of
 * the Derivative Works; and
 *
 * (d) If the Work includes a "NOTICE" text file as part of its
 * distribution, then any Derivative Works that You distribute must
 * include a readable copy of the attribution notices contained
 * within such NOTICE file, excluding those notices that do not
 * pertain to any part of the Derivative Works, in at least one
 * of the following places: within a NOTICE text file distributed
 * as part of the Derivative Works; within the Source form or
 * documentation, if provided along with the Derivative Works; or,
 * within a display generated by the Derivative Works, if and
 * wherever such third-party notices normally appear. The contents
 * of the NOTICE file are for informational purposes only and
 * do not modify the License. You may add Your own attribution
 * notices within Derivative Works that You distribute, alongside
 * or as an addendum to the NOTICE text from the Work, provided
 * that such additional attribution notices cannot be construed
 * as modifying the License.
 *
 * You may add Your own copyright statement to Your modifications and
 * may provide additional or different license terms and conditions
 * for use, reproduction, or distribution of Your modifications, or
 * for any such Derivative Works as a whole, provided Your use,
 * reproduction, and distribution of the Work otherwise complies with
 * the conditions stated in this License.
 *
 * 5. Submission of Contributions. Unless You explicitly state otherwise,
 * any Contribution intentionally submitted for inclusion in the Work
 * by You to the Licensor shall be under the terms and conditions of
 * this License, without any additional terms or conditions.
 * Notwithstanding the above, nothing herein shall supersede or modify
 * the terms of any separate license agreement you may have executed
 * with Licensor regarding such Contributions.
 *
 * 6. Trademarks. This License does not grant permission to use the trade
 * names, trademarks, service marks, or product names of the Licensor,
 * except as required for reasonable and customary use in describing the
 * origin of the Work and reproducing the content of the NOTICE file.
 *
 * 7. Disclaimer of Warranty. Unless required by applicable law or
 * agreed to in writing, Licensor provides the Work (and each
 * Contributor provides its Contributions) on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied, including, without limitation, any warranties or conditions
 * of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
 * PARTICULAR PURPOSE. You are solely responsible for determining the
 * appropriateness of using or redistributing the Work and assume any
 * risks associated with Your exercise of permissions under this License.
 *
 * 8. Limitation of Liability. In no event and under no legal theory,
 * whether in tort (including negligence), contract, or otherwise,
 * unless required by applicable law (such as deliberate and grossly
 * negligent acts) or agreed to in writing, shall any Contributor be
 * liable to You for damages, including any direct, indirect, special,
 * incidental, or consequential damages of any character arising as a
 * result of this License or out of the use or inability to use the
 * Work (including but not limited to damages for loss of goodwill,
 * work stoppage, computer failure or malfunction, or any and all
 * other commercial damages or losses), even if such Contributor
 * has been advised of the possibility of such damages.
 *
 * 9. Accepting Warranty or Additional Liability. While redistributing
 * the Work or Derivative Works thereof, You may choose to offer,
 * and charge a fee for, acceptance of support, warranty, indemnity,
 * or other liability obligations and/or rights consistent with this
 * License. However, in accepting such obligations, You may act only
 * on Your own behalf and on Your sole responsibility, not on behalf
 * of any other Contributor, and only if You agree to indemnify,
 * defend, and hold each Contributor harmless for any liability
 * incurred by, or claims asserted against, such Contributor by reason
 * of your accepting any such warranty or additional liability.
 *
 * END OF TERMS AND CONDITIONS
 *
 * APPENDIX: How to apply the Apache License to your work.
 *
 * To apply the Apache License to your work, attach the following
 * boilerplate notice, with the fields enclosed by brackets "[]"
 * replaced with your own identifying information. (Don't include
 * the brackets!) The text should be enclosed in the appropriate
 * comment syntax for the file format. We also recommend that a
 * file or class name and description of purpose be included on the
 * same "printed page" as the copyright notice for easier
 * identification within third-party archives.
 *
 * Copyright 2007 Kasper B. Graversen
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
@@ -0,0 +1,35 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.util.List;

public class CsvLogStructureFinderFactory implements LogStructureFinderFactory {

    /**
     * Rules are:
     * - The file must be valid CSV
     * - It must contain at least two complete records
     * - There must be at least two fields per record (otherwise files with no commas could be treated as CSV!)
     * - Every CSV record except the last must have the same number of fields
     * The reason the last record is allowed to have fewer fields than the others is that
     * it could have been truncated when the file was sampled.
     */
    @Override
    public boolean canCreateFromSample(List<String> explanation, String sample) {
        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.EXCEL_PREFERENCE, "CSV");
    }

    @Override
    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
        throws IOException {
        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
            CsvPreference.EXCEL_PREFERENCE, false);
    }
}
@@ -0,0 +1,615 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.grok.Grok;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Creates Grok patterns that will match all provided sample messages.
 *
 * The choice of field names is quite primitive. The intention is that a human will edit these.
 */
public final class GrokPatternCreator {

    private static final Map<Character, Boolean> PUNCTUATION_OR_SPACE_NEEDS_ESCAPING;
    static {
        HashMap<Character, Boolean> punctuationOrSpaceNeedsEscaping = new HashMap<>();
        String punctuationAndSpaceCharacters = "\"'`‘’“”#@%=\\/|~:;,<>()[]{}«»^$*¿?¡!§¶ \t\n";
        String punctuationThatNeedsEscaping = "\\|()[]{}^$*?";
        punctuationAndSpaceCharacters.chars()
            .forEach(c -> punctuationOrSpaceNeedsEscaping.put((char) c, punctuationThatNeedsEscaping.indexOf(c) >= 0));
        PUNCTUATION_OR_SPACE_NEEDS_ESCAPING = Collections.unmodifiableMap(punctuationOrSpaceNeedsEscaping);
    }

    private static final String PREFACE = "preface";
    private static final String VALUE = "value";
    private static final String EPILOGUE = "epilogue";

    /**
     * Grok patterns that are designed to match the whole message, not just a part of it.
     */
    private static final List<FullMatchGrokPatternCandidate> FULL_MATCH_GROK_PATTERNS = Arrays.asList(
        new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"),
        new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"),
        new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"),
        new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"),
        new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"),
        new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"),
        new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"),
        new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"),
        new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"),
        new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"),
        new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"),
        new FullMatchGrokPatternCandidate("RAILS3", "timestamp"),
        new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"),
        new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"),
        new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp")
    );

    /**
     * The first match in this list will be chosen, so it needs to be ordered
     * such that more generic patterns come after more specific patterns.
     */
    private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
        new ValueOnlyGrokPatternCandidate("TOMCAT_DATESTAMP", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC822", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC2822", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("DATESTAMP_OTHER", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("DATESTAMP_EVENTLOG", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("SYSLOGTIMESTAMP", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("HTTPDATE", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("CATALINA_DATESTAMP", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("CISCOTIMESTAMP", "date", "extra_timestamp"),
        new ValueOnlyGrokPatternCandidate("LOGLEVEL", "keyword", "loglevel"),
        new ValueOnlyGrokPatternCandidate("URI", "keyword", "uri"),
        new ValueOnlyGrokPatternCandidate("UUID", "keyword", "uuid"),
        new ValueOnlyGrokPatternCandidate("MAC", "keyword", "macaddress"),
        // Can't use \b as the breaks, because slashes are not "word" characters
        new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(?<!\\w)", "(?!\\w)"),
        new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"),
        // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
        new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"),
        new ValueOnlyGrokPatternCandidate("DATE", "date", "date"),
        new ValueOnlyGrokPatternCandidate("TIME", "date", "time"),
        // This already includes pre/post break conditions
        new ValueOnlyGrokPatternCandidate("QUOTEDSTRING", "keyword", "field", "", ""),
        // Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
        // up numeric suffices too eagerly
        new ValueOnlyGrokPatternCandidate("INT", "long", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
        new ValueOnlyGrokPatternCandidate("NUMBER", "double", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
        new ValueOnlyGrokPatternCandidate("BASE16NUM", "keyword", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
        // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
        // Fixing these problems with overly broad matches would require some extra intelligence
        // to be added to remove inappropriate matches. One idea would be to use a dictionary,
        // but that doesn't necessarily help as "jay" could be a username but is also a dictionary
        // word (plus there's the international headache with relying on dictionaries). Similarly,
        // hostnames could also be dictionary words - I've worked on machines called "hippo" and
        // "scarf" in the past. Another idea would be to look at the adjacent characters and
        // apply some heuristic based on those.
    );

    /**
     * It is expected that the explanation will be shared with other code.
     * Both this class and other classes will update it.
     */
    private final List<String> explanation;
    private final Collection<String> sampleMessages;

    /**
     * It is expected that the mappings will be shared with other code.
     * Both this class and other classes will update it.
     */
    private final Map<String, Object> mappings;
    private final Map<String, Integer> fieldNameCountStore = new HashMap<>();
    private final StringBuilder overallGrokPatternBuilder = new StringBuilder();

    /**
     *
     * @param explanation List of reasons for making decisions. May contain items when passed and new reasons
     *                    can be appended by the methods of this class.
     * @param sampleMessages Sample messages that any Grok pattern found must match.
     * @param mappings Will be updated with mappings appropriate for the returned pattern, if non-<code>null</code>.
     */
    public GrokPatternCreator(List<String> explanation, Collection<String> sampleMessages, Map<String, Object> mappings) {
        this.explanation = explanation;
        this.sampleMessages = Collections.unmodifiableCollection(sampleMessages);
        this.mappings = mappings;
    }

    /**
     * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
     * @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
     */
    public Tuple<String, String> findFullLineGrokPattern() {

        for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
            if (candidate.matchesAll(sampleMessages)) {
                return candidate.processMatch(explanation, sampleMessages, mappings);
            }
        }

        return null;
    }

    /**
     * Build a Grok pattern that will match all of the sample messages in their entirety.
     * @param seedPatternName A pattern that has already been determined to match some portion of every sample message.
     * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches.
     * @return The built Grok pattern.
     */
    public String createGrokPatternFromExamples(String seedPatternName, String seedFieldName) {

        overallGrokPatternBuilder.setLength(0);

        GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName);

        processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0);

        return overallGrokPatternBuilder.toString().replace("\t", "\\t").replace("\n", "\\n");
    }

    /**
     * This is purely to allow unit tests to inspect the partial Grok pattern after testing implementation details.
     * It should not be used in production code.
     */
    StringBuilder getOverallGrokPatternBuilder() {
        return overallGrokPatternBuilder;
    }

    /**
     * Given a chosen Grok pattern and a collection of message snippets, split the snippets into the
     * matched section and the pieces before and after it. Recurse to find more matches in the pieces
     * before and after and update the supplied string builder.
     */
    private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolean isLast, Collection<String> snippets,
                                          boolean ignoreKeyValueCandidateLeft, int ignoreValueOnlyCandidatesLeft,
                                          boolean ignoreKeyValueCandidateRight, int ignoreValueOnlyCandidatesRight) {

        Collection<String> prefaces = new ArrayList<>();
        Collection<String> epilogues = new ArrayList<>();
        String patternBuilderContent = chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings);
        appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft);
        overallGrokPatternBuilder.append(patternBuilderContent);
        appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight);
    }

    /**
     * Given a collection of message snippets, work out which (if any) of the Grok patterns we're allowed
     * to use matches it best. Then append the appropriate Grok language to represent that finding onto
     * the supplied string builder.
     */
    void appendBestGrokMatchForStrings(boolean isLast, Collection<String> snippets,
                                       boolean ignoreKeyValueCandidate, int ignoreValueOnlyCandidates) {

        snippets = adjustForPunctuation(snippets);

        GrokPatternCandidate bestCandidate = null;
        if (snippets.isEmpty() == false) {
            GrokPatternCandidate kvCandidate = new KeyValueGrokPatternCandidate(explanation);
            if (ignoreKeyValueCandidate == false && kvCandidate.matchesAll(snippets)) {
                bestCandidate = kvCandidate;
            } else {
                ignoreKeyValueCandidate = true;
                for (GrokPatternCandidate candidate :
                    ORDERED_CANDIDATE_GROK_PATTERNS.subList(ignoreValueOnlyCandidates, ORDERED_CANDIDATE_GROK_PATTERNS.size())) {
                    if (candidate.matchesAll(snippets)) {
                        bestCandidate = candidate;
                        break;
                    }
                    ++ignoreValueOnlyCandidates;
                }
            }
        }

        if (bestCandidate == null) {
            if (isLast) {
                finalizeGrokPattern(snippets);
            } else {
                addIntermediateRegex(snippets);
            }
        } else {
            processCandidateAndSplit(bestCandidate, isLast, snippets, true, ignoreValueOnlyCandidates + (ignoreKeyValueCandidate ? 1 : 0),
                ignoreKeyValueCandidate, ignoreValueOnlyCandidates);
        }
    }

    /**
     * If the snippets supplied begin with more than 1 character of common punctuation or whitespace
     * then add all but the last of these characters to the overall pattern and remove them from the
     * snippets.
     * @param snippets Input snippets - not modified.
     * @return Output snippets, which will be a copy of the input snippets but with whatever characters
     *         were added to <code>overallPatternBuilder</code> removed from the beginning.
     */
    Collection<String> adjustForPunctuation(Collection<String> snippets) {

        assert snippets.isEmpty() == false;

        StringBuilder commonInitialPunctuation = new StringBuilder();

        for (String snippet : snippets) {

            if (commonInitialPunctuation.length() == 0) {
                for (int index = 0; index < snippet.length(); ++index) {
                    char ch = snippet.charAt(index);
                    if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch) != null) {
                        commonInitialPunctuation.append(ch);
                    } else {
                        break;
                    }
                }
            } else {
                if (commonInitialPunctuation.length() > snippet.length()) {
                    commonInitialPunctuation.delete(snippet.length(), commonInitialPunctuation.length());
                }
                for (int index = 0; index < commonInitialPunctuation.length(); ++index) {
                    char ch = snippet.charAt(index);
                    if (ch != commonInitialPunctuation.charAt(index)) {
                        commonInitialPunctuation.delete(index, commonInitialPunctuation.length());
                        break;
                    }
                }
            }

            if (commonInitialPunctuation.length() <= 1) {
                return snippets;
            }
        }

        int numLiteralCharacters = commonInitialPunctuation.length() - 1;

        for (int index = 0; index < numLiteralCharacters; ++index) {
            char ch = commonInitialPunctuation.charAt(index);
            if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.getOrDefault(ch, false)) {
                overallGrokPatternBuilder.append('\\');
            }
            overallGrokPatternBuilder.append(ch);
        }

        return snippets.stream().map(snippet -> snippet.substring(numLiteralCharacters)).collect(Collectors.toList());
    }

    /**
     * The first time a particular field name is passed, simply return it.
     * The second time return it with "2" appended.
     * The third time return it with "3" appended.
     * Etc.
     */
    static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
        Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
        return (numberSeen > 1) ? fieldName + numberSeen : fieldName;
    }

    private void addIntermediateRegex(Collection<String> snippets) {
        addIntermediateRegex(overallGrokPatternBuilder, snippets);
    }

    public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
        if (snippets.isEmpty()) {
            return;
        }

        List<String> others = new ArrayList<>(snippets);
        String driver = others.remove(others.size() - 1);

        boolean wildcardRequiredIfNonMatchFound = true;
        for (int i = 0; i < driver.length(); ++i) {
            char ch = driver.charAt(i);
            Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
            if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
                if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
                    patternBuilder.append(".*?");
                }
                if (punctuationOrSpaceNeedsEscaping) {
                    patternBuilder.append('\\');
                }
                patternBuilder.append(ch);
                wildcardRequiredIfNonMatchFound = true;
                others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
            } else if (wildcardRequiredIfNonMatchFound) {
                patternBuilder.append(".*?");
                wildcardRequiredIfNonMatchFound = false;
            }
        }

        if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
            patternBuilder.append(".*?");
        }
    }

    private void finalizeGrokPattern(Collection<String> snippets) {
        if (snippets.stream().allMatch(String::isEmpty)) {
            return;
        }

        List<String> others = new ArrayList<>(snippets);
        String driver = others.remove(others.size() - 1);

        for (int i = 0; i < driver.length(); ++i) {
            char ch = driver.charAt(i);
            int driverIndex = i;
            Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
            if (punctuationOrSpaceNeedsEscaping != null &&
                others.stream().allMatch(other -> other.length() > driverIndex && other.charAt(driverIndex) == ch)) {
                if (punctuationOrSpaceNeedsEscaping) {
                    overallGrokPatternBuilder.append('\\');
                }
                overallGrokPatternBuilder.append(ch);
                if (i == driver.length() - 1 && others.stream().allMatch(driver::equals)) {
                    return;
                }
            } else {
                break;
            }
        }

        overallGrokPatternBuilder.append(".*");
    }

    interface GrokPatternCandidate {

        /**
         * @return Does this Grok pattern candidate match all the snippets?
         */
        boolean matchesAll(Collection<String> snippets);

        /**
         * After it has been determined that this Grok pattern candidate matches a collection of strings,
         * return collections of the bits that come before (prefaces) and after (epilogues) the bit
         * that matches. Also update mappings with the most appropriate field name and type.
         * @return The string that needs to be incorporated into the overall Grok pattern for the line.
         */
        String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
                               Collection<String> epilogues, Map<String, Object> mappings);
    }

    /**
     * A Grok pattern candidate that will match a single named Grok pattern.
     */
    static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate {

        private final String grokPatternName;
        private final String mappingType;
        private final String fieldName;
        private final Grok grok;

        /**
         * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
         * end with a non "word" character (i.e. letter, number or underscore). For such patterns use one
         * of the other constructors.
         * <p>
         * In cases where the Grok pattern defined by Logstash already includes conditions on what must
         * come before and after the match, use one of the other constructors and specify an empty string
         * for the pre and/or post breaks.
         *
         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
         * @param fieldName Name of the field to extract from the match.
         */
        ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) {
            this(grokPatternName, mappingType, fieldName, "\\b", "\\b");
        }

        /**
         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
         * @param mappingType Data type for field in Elasticsearch mappings.
         * @param fieldName Name of the field to extract from the match.
         * @param preBreak Only consider the match if it's broken from the previous text by this.
         * @param postBreak Only consider the match if it's broken from the following text by this.
         */
        ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) {
            this.grokPatternName = grokPatternName;
            this.mappingType = mappingType;
            this.fieldName = fieldName;
            // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
            grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + preBreak +
                "%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
        }

        @Override
        public boolean matchesAll(Collection<String> snippets) {
            return snippets.stream().allMatch(grok::match);
        }

        /**
         * Given a collection of strings, and a Grok pattern that matches some part of them all,
         * return collections of the bits that come before (prefaces) and after (epilogues) the
         * bit that matches.
         */
        @Override
        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
                                      Collection<String> epilogues, Map<String, Object> mappings) {
            String sampleValue = null;
            for (String snippet : snippets) {
                Map<String, Object> captures = grok.captures(snippet);
                // If the pattern doesn't match then captures will be null
                if (captures == null) {
                    throw new IllegalStateException("[%{" + grokPatternName + "}] does not match snippet [" + snippet + "]");
                }
                prefaces.add(captures.getOrDefault(PREFACE, "").toString());
                if (sampleValue == null) {
                    sampleValue = captures.get(VALUE).toString();
                }
                epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
            }
            String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
            if (mappings != null) {
                Map<String, String> fullMappingType = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, mappingType);
                if ("date".equals(mappingType)) {
                    TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(sampleValue);
                    if (timestampMatch != null) {
                        fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat();
                    }
                }
                mappings.put(adjustedFieldName, fullMappingType);
            }
            return "%{" + grokPatternName + ":" + adjustedFieldName + "}";
        }
    }

    /**
     * Unlike the {@link ValueOnlyGrokPatternCandidate} an object of this class is not immutable and not thread safe.
     * When a given object matches a set of strings it chooses a field name. Then that same field name is used when
     * processing captures from the pattern. Hence only a single thread may use any particular instance of this
     * class.
     */
    static class KeyValueGrokPatternCandidate implements GrokPatternCandidate {

        private static final Pattern kvFinder = Pattern.compile("\\b(\\w+)=[\\w.-]+");
        private final List<String> explanation;
        private String fieldName;

        KeyValueGrokPatternCandidate(List<String> explanation) {
            this.explanation = explanation;
        }

        @Override
        public boolean matchesAll(Collection<String> snippets) {
            Set<String> candidateNames = new LinkedHashSet<>();
            boolean isFirst = true;
            for (String snippet : snippets) {
                if (isFirst) {
                    Matcher matcher = kvFinder.matcher(snippet);
                    while (matcher.find()) {
                        candidateNames.add(matcher.group(1));
                    }
                    isFirst = false;
                } else {
                    candidateNames.removeIf(candidateName ->
                        Pattern.compile("\\b" + candidateName + "=[\\w.-]+").matcher(snippet).find() == false);
                }
                if (candidateNames.isEmpty()) {
                    break;
                }
            }
            return (fieldName = candidateNames.stream().findFirst().orElse(null)) != null;
        }

        @Override
        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
                                      Collection<String> epilogues, Map<String, Object> mappings) {
            if (fieldName == null) {
                throw new IllegalStateException("Cannot process KV matches until a field name has been determined");
            }
            Grok grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}\\b" +
                fieldName + "=%{USER:" + VALUE + "}%{GREEDYDATA:" + EPILOGUE + "}");
            Collection<String> values = new ArrayList<>();
            for (String snippet : snippets) {
                Map<String, Object> captures = grok.captures(snippet);
                // If the pattern doesn't match then captures will be null
                if (captures == null) {
                    throw new IllegalStateException("[\\b" + fieldName + "=%{USER}] does not match snippet [" + snippet + "]");
                }
                prefaces.add(captures.getOrDefault(PREFACE, "").toString());
                values.add(captures.getOrDefault(VALUE, "").toString());
                epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
            }
            String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
            if (mappings != null) {
                mappings.put(adjustedFieldName, LogStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values));
            }
            return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}";
        }
    }

    /**
     * A Grok pattern candidate that matches a single named Grok pattern but will not update mappings.
     */
    static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate {

        NoMappingGrokPatternCandidate(String grokPatternName, String fieldName) {
            super(grokPatternName, null, fieldName);
        }

        @Override
        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
                                      Collection<String> epilogues, Map<String, Object> mappings) {
            return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null);
        }
    }

    /**
     * Used to check whether a single Grok pattern matches every sample message in its entirety.
     */
    static class FullMatchGrokPatternCandidate {

        private final String grokString;
        private final String timeField;
        private final Grok grok;

        FullMatchGrokPatternCandidate(String grokPatternName, String timeField) {
            grokString = "%{" + grokPatternName + "}";
            this.timeField = timeField;
            grok = new Grok(Grok.getBuiltinPatterns(), grokString);
        }

        public boolean matchesAll(Collection<String> sampleMessages) {
            return sampleMessages.stream().allMatch(grok::match);
        }

        /**
         * This must only be called if {@link #matchesAll} returns <code>true</code>.
         * @return A tuple of (time field name, Grok string).
         */
        public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
                                                  Map<String, Object> mappings) {

            explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate");

            if (mappings != null) {
                Map<String, Collection<String>> valuesPerField = new HashMap<>();

                for (String sampleMessage : sampleMessages) {
                    Map<String, Object> captures = grok.captures(sampleMessage);
                    // If the pattern doesn't match then captures will be null
                    if (captures == null) {
                        throw new IllegalStateException("[" + grokString + "] does not match snippet [" + sampleMessage + "]");
                    }
                    for (Map.Entry<String, Object> capture : captures.entrySet()) {

                        String fieldName = capture.getKey();
                        String fieldValue = capture.getValue().toString();

                        // Exclude the time field because that will be dropped and replaced with @timestamp
                        if (fieldName.equals(timeField) == false) {
                            valuesPerField.compute(fieldName, (k, v) -> {
                                if (v == null) {
                                    return new ArrayList<>(Collections.singletonList(fieldValue));
                                } else {
                                    v.add(fieldValue);
                                    return v;
                                }
                            });
                        }
                    }
                }

                for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
                    String fieldName = valuesForField.getKey();
                    mappings.put(fieldName,
                        LogStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
                }
            }

            return new Tuple<>(timeField, grokString);
        }
    }
}
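A quick note on the field naming in GrokPatternCreator above: buildFieldName hands out the bare name on first use and numeric suffixes on repeats, and the counts are kept per name in the shared store. A tiny hypothetical illustration of the scheme it implements:

Map<String, Integer> counts = new HashMap<>();
GrokPatternCreator.buildFieldName(counts, "field");     // returns "field"
GrokPatternCreator.buildFieldName(counts, "field");     // returns "field2"
GrokPatternCreator.buildFieldName(counts, "field");     // returns "field3"
GrokPatternCreator.buildFieldName(counts, "ipaddress"); // returns "ipaddress" - a fresh name starts at 1
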
@@ -0,0 +1,84 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.stream.Collectors;

import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;

/**
 * Really ND-JSON.
 */
public class JsonLogStructureFinder implements LogStructureFinder {

    private final List<String> sampleMessages;
    private final LogStructure structure;

    static JsonLogStructureFinder makeJsonLogStructureFinder(List<String> explanation, String sample, String charsetName,
                                                             Boolean hasByteOrderMarker) throws IOException {

        List<Map<String, ?>> sampleRecords = new ArrayList<>();

        List<String> sampleMessages = Arrays.asList(sample.split("\n"));
        for (String sampleMessage : sampleMessages) {
            XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
                sampleMessage);
            sampleRecords.add(parser.mapOrdered());
        }

        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.JSON)
            .setCharset(charsetName)
            .setHasByteOrderMarker(hasByteOrderMarker)
            .setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
            .setNumLinesAnalyzed(sampleMessages.size())
            .setNumMessagesAnalyzed(sampleRecords.size());

        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
        if (timeField != null) {
            structureBuilder.setTimestampField(timeField.v1())
                .setTimestampFormats(timeField.v2().dateFormats)
                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
        }

        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));

        LogStructure structure = structureBuilder
            .setMappings(mappings)
            .setExplanation(explanation)
            .build();

        return new JsonLogStructureFinder(sampleMessages, structure);
    }

    private JsonLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
        this.structure = structure;
    }

    @Override
    public List<String> getSampleMessages() {
        return sampleMessages;
    }

    @Override
    public LogStructure getStructure() {
        return structure;
    }
}
@@ -0,0 +1,87 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Locale;

import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;

public class JsonLogStructureFinderFactory implements LogStructureFinderFactory {

    /**
     * This format matches if the sample consists of one or more JSON documents.
     * If there is more than one, they must be newline-delimited. The
     * documents must be non-empty, to prevent lines containing "{}" from matching.
     */
    @Override
    public boolean canCreateFromSample(List<String> explanation, String sample) {

        int completeDocCount = 0;

        try {
            String[] sampleLines = sample.split("\n");
            for (String sampleLine : sampleLines) {
                try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY,
                    DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {

                    if (parser.map().isEmpty()) {
                        explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
                        return false;
                    }
                    ++completeDocCount;
                    if (parser.nextToken() != null) {
                        explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
                            sampleLine + "]");
                        return false;
                    }
                }
            }
        } catch (IOException | IllegalStateException e) {
            explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
            return false;
        }

        if (completeDocCount == 0) {
            explanation.add("Not JSON because sample didn't contain a complete document");
            return false;
        }

        explanation.add("Deciding sample is newline delimited JSON");
        return true;
    }

    @Override
    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
        throws IOException {
        return JsonLogStructureFinder.makeJsonLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
    }

    private static class ContextPrintingStringReader extends StringReader {

        private final String str;

        ContextPrintingStringReader(String str) {
            super(str);
            this.str = str;
        }

        @Override
        public String toString() {
            if (str.length() <= 80) {
                return String.format(Locale.ROOT, "\"%s\"", str);
            } else {
                return String.format(Locale.ROOT, "\"%.77s...\"", str);
            }
        }
    }
}
@ -0,0 +1,614 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Stores the log file format determined by a {@link LogStructureFinder}.
*/
public class LogStructure implements ToXContentObject {

public enum Format {

JSON, XML, CSV, TSV, SEMI_COLON_SEPARATED_VALUES, PIPE_SEPARATED_VALUES, SEMI_STRUCTURED_TEXT;

public Character separator() {
switch (this) {
case JSON:
case XML:
return null;
case CSV:
return ',';
case TSV:
return '\t';
case SEMI_COLON_SEPARATED_VALUES:
return ';';
case PIPE_SEPARATED_VALUES:
return '|';
case SEMI_STRUCTURED_TEXT:
return null;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}

public boolean supportsNesting() {
switch (this) {
case JSON:
case XML:
return true;
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
case SEMI_STRUCTURED_TEXT:
return false;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}

public boolean isStructured() {
switch (this) {
case JSON:
case XML:
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
return true;
case SEMI_STRUCTURED_TEXT:
return false;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}

public boolean isSemiStructured() {
switch (this) {
case JSON:
case XML:
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
return false;
case SEMI_STRUCTURED_TEXT:
return true;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}

public boolean isSeparatedValues() {
switch (this) {
case JSON:
case XML:
return false;
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
return true;
case SEMI_STRUCTURED_TEXT:
return false;
default:
throw new IllegalStateException("enum value [" + this + "] missing from switch.");
}
}

public static Format fromSeparator(char separator) {
switch (separator) {
case ',':
return CSV;
case '\t':
return TSV;
case ';':
return SEMI_COLON_SEPARATED_VALUES;
case '|':
return PIPE_SEPARATED_VALUES;
default:
throw new IllegalArgumentException("No known format has separator [" + separator + "]");
}
}

public static Format fromString(String name) {
return valueOf(name.trim().toUpperCase(Locale.ROOT));
}

@Override
public String toString() {
return name().toLowerCase(Locale.ROOT);
}
}

static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
static final ParseField SAMPLE_START = new ParseField("sample_start");
static final ParseField CHARSET = new ParseField("charset");
static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
static final ParseField STRUCTURE = new ParseField("format");
static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
static final ParseField INPUT_FIELDS = new ParseField("input_fields");
static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
static final ParseField SEPARATOR = new ParseField("separator");
static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
static final ParseField MAPPINGS = new ParseField("mappings");
static final ParseField EXPLANATION = new ParseField("explanation");

public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("log_file_structure", false, Builder::new);

static {
PARSER.declareInt(Builder::setNumLinesAnalyzed, NUM_LINES_ANALYZED);
PARSER.declareInt(Builder::setNumMessagesAnalyzed, NUM_MESSAGES_ANALYZED);
PARSER.declareString(Builder::setSampleStart, SAMPLE_START);
PARSER.declareString(Builder::setCharset, CHARSET);
PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER);
PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE);
PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
PARSER.declareStringArray(Builder::setInputFields, INPUT_FIELDS);
PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
PARSER.declareString((p, c) -> p.setSeparator(c.charAt(0)), SEPARATOR);
PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS);
PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE);
PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS);
PARSER.declareStringArray(Builder::setExplanation, EXPLANATION);
}

private final int numLinesAnalyzed;
private final int numMessagesAnalyzed;
private final String sampleStart;
private final String charset;
private final Boolean hasByteOrderMarker;
private final Format format;
private final String multilineStartPattern;
private final String excludeLinesPattern;
private final List<String> inputFields;
private final Boolean hasHeaderRow;
private final Character separator;
private final Boolean shouldTrimFields;
private final String grokPattern;
private final List<String> timestampFormats;
private final String timestampField;
private final boolean needClientTimezone;
private final SortedMap<String, Object> mappings;
private final List<String> explanation;

public LogStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
Format format, String multilineStartPattern, String excludeLinesPattern, List<String> inputFields,
Boolean hasHeaderRow, Character separator, Boolean shouldTrimFields, String grokPattern, String timestampField,
List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
List<String> explanation) {

this.numLinesAnalyzed = numLinesAnalyzed;
this.numMessagesAnalyzed = numMessagesAnalyzed;
this.sampleStart = Objects.requireNonNull(sampleStart);
this.charset = Objects.requireNonNull(charset);
this.hasByteOrderMarker = hasByteOrderMarker;
this.format = Objects.requireNonNull(format);
this.multilineStartPattern = multilineStartPattern;
this.excludeLinesPattern = excludeLinesPattern;
this.inputFields = (inputFields == null) ? null : Collections.unmodifiableList(new ArrayList<>(inputFields));
this.hasHeaderRow = hasHeaderRow;
this.separator = separator;
this.shouldTrimFields = shouldTrimFields;
this.grokPattern = grokPattern;
this.timestampField = timestampField;
this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats));
this.needClientTimezone = needClientTimezone;
this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings));
this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
}

public int getNumLinesAnalyzed() {
return numLinesAnalyzed;
}

public int getNumMessagesAnalyzed() {
return numMessagesAnalyzed;
}

public String getSampleStart() {
return sampleStart;
}

public String getCharset() {
return charset;
}

public Boolean getHasByteOrderMarker() {
return hasByteOrderMarker;
}

public Format getFormat() {
return format;
}

public String getMultilineStartPattern() {
return multilineStartPattern;
}

public String getExcludeLinesPattern() {
return excludeLinesPattern;
}

public List<String> getInputFields() {
return inputFields;
}

public Boolean getHasHeaderRow() {
return hasHeaderRow;
}

public Character getSeparator() {
return separator;
}

public Boolean getShouldTrimFields() {
return shouldTrimFields;
}

public String getGrokPattern() {
return grokPattern;
}

public String getTimestampField() {
return timestampField;
}

public List<String> getTimestampFormats() {
return timestampFormats;
}

public boolean needClientTimezone() {
return needClientTimezone;
}

public SortedMap<String, Object> getMappings() {
return mappings;
}

public List<String> getExplanation() {
return explanation;
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {

builder.startObject();
builder.field(NUM_LINES_ANALYZED.getPreferredName(), numLinesAnalyzed);
builder.field(NUM_MESSAGES_ANALYZED.getPreferredName(), numMessagesAnalyzed);
builder.field(SAMPLE_START.getPreferredName(), sampleStart);
builder.field(CHARSET.getPreferredName(), charset);
if (hasByteOrderMarker != null) {
builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue());
}
builder.field(STRUCTURE.getPreferredName(), format);
if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) {
builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern);
}
if (excludeLinesPattern != null && excludeLinesPattern.isEmpty() == false) {
builder.field(EXCLUDE_LINES_PATTERN.getPreferredName(), excludeLinesPattern);
}
if (inputFields != null && inputFields.isEmpty() == false) {
builder.field(INPUT_FIELDS.getPreferredName(), inputFields);
}
if (hasHeaderRow != null) {
builder.field(HAS_HEADER_ROW.getPreferredName(), hasHeaderRow.booleanValue());
}
if (separator != null) {
builder.field(SEPARATOR.getPreferredName(), String.valueOf(separator));
}
if (shouldTrimFields != null) {
builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue());
}
if (grokPattern != null && grokPattern.isEmpty() == false) {
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
}
if (timestampField != null && timestampField.isEmpty() == false) {
builder.field(TIMESTAMP_FIELD.getPreferredName(), timestampField);
}
if (timestampFormats != null && timestampFormats.isEmpty() == false) {
builder.field(TIMESTAMP_FORMATS.getPreferredName(), timestampFormats);
}
builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone);
builder.field(MAPPINGS.getPreferredName(), mappings);
builder.field(EXPLANATION.getPreferredName(), explanation);
builder.endObject();

return builder;
}

@Override
public int hashCode() {

return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern, timestampField,
timestampFormats, needClientTimezone, mappings, explanation);
}

@Override
public boolean equals(Object other) {

if (this == other) {
return true;
}

if (other == null || getClass() != other.getClass()) {
return false;
}

LogStructure that = (LogStructure) other;
return this.numLinesAnalyzed == that.numLinesAnalyzed &&
this.numMessagesAnalyzed == that.numMessagesAnalyzed &&
this.needClientTimezone == that.needClientTimezone &&
Objects.equals(this.sampleStart, that.sampleStart) &&
Objects.equals(this.charset, that.charset) &&
Objects.equals(this.hasByteOrderMarker, that.hasByteOrderMarker) &&
Objects.equals(this.format, that.format) &&
Objects.equals(this.multilineStartPattern, that.multilineStartPattern) &&
Objects.equals(this.excludeLinesPattern, that.excludeLinesPattern) &&
Objects.equals(this.inputFields, that.inputFields) &&
Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
Objects.equals(this.separator, that.separator) &&
Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
Objects.equals(this.grokPattern, that.grokPattern) &&
Objects.equals(this.timestampField, that.timestampField) &&
Objects.equals(this.timestampFormats, that.timestampFormats) &&
Objects.equals(this.mappings, that.mappings) &&
Objects.equals(this.explanation, that.explanation);
}

public static class Builder {

private int numLinesAnalyzed;
private int numMessagesAnalyzed;
private String sampleStart;
private String charset;
private Boolean hasByteOrderMarker;
private Format format;
private String multilineStartPattern;
private String excludeLinesPattern;
private List<String> inputFields;
private Boolean hasHeaderRow;
private Character separator;
private Boolean shouldTrimFields;
private String grokPattern;
private String timestampField;
private List<String> timestampFormats;
private boolean needClientTimezone;
private Map<String, Object> mappings;
private List<String> explanation;

public Builder() {
this(Format.SEMI_STRUCTURED_TEXT);
}

public Builder(Format format) {
setFormat(format);
}

public Builder setNumLinesAnalyzed(int numLinesAnalyzed) {
this.numLinesAnalyzed = numLinesAnalyzed;
return this;
}

public Builder setNumMessagesAnalyzed(int numMessagesAnalyzed) {
this.numMessagesAnalyzed = numMessagesAnalyzed;
return this;
}

public Builder setSampleStart(String sampleStart) {
this.sampleStart = Objects.requireNonNull(sampleStart);
return this;
}

public Builder setCharset(String charset) {
this.charset = Objects.requireNonNull(charset);
return this;
}

public Builder setHasByteOrderMarker(Boolean hasByteOrderMarker) {
this.hasByteOrderMarker = hasByteOrderMarker;
return this;
}

public Builder setFormat(Format format) {
this.format = Objects.requireNonNull(format);
this.separator = format.separator();
return this;
}

public Builder setMultilineStartPattern(String multilineStartPattern) {
this.multilineStartPattern = multilineStartPattern;
return this;
}

public Builder setExcludeLinesPattern(String excludeLinesPattern) {
this.excludeLinesPattern = excludeLinesPattern;
return this;
}

public Builder setInputFields(List<String> inputFields) {
this.inputFields = inputFields;
return this;
}

public Builder setHasHeaderRow(Boolean hasHeaderRow) {
this.hasHeaderRow = hasHeaderRow;
return this;
}

public Builder setShouldTrimFields(Boolean shouldTrimFields) {
this.shouldTrimFields = shouldTrimFields;
return this;
}

public Builder setSeparator(Character separator) {
this.separator = separator;
return this;
}

public Builder setGrokPattern(String grokPattern) {
this.grokPattern = grokPattern;
return this;
}

public Builder setTimestampField(String timestampField) {
this.timestampField = timestampField;
return this;
}

public Builder setTimestampFormats(List<String> timestampFormats) {
this.timestampFormats = timestampFormats;
return this;
}

public Builder setNeedClientTimezone(boolean needClientTimezone) {
this.needClientTimezone = needClientTimezone;
return this;
}

public Builder setMappings(Map<String, Object> mappings) {
this.mappings = Objects.requireNonNull(mappings);
return this;
}

public Builder setExplanation(List<String> explanation) {
this.explanation = Objects.requireNonNull(explanation);
return this;
}

@SuppressWarnings("fallthrough")
public LogStructure build() {

if (numLinesAnalyzed <= 0) {
throw new IllegalArgumentException("Number of lines analyzed must be positive.");
}

if (numMessagesAnalyzed <= 0) {
throw new IllegalArgumentException("Number of messages analyzed must be positive.");
}

if (numMessagesAnalyzed > numLinesAnalyzed) {
throw new IllegalArgumentException("Number of messages analyzed cannot be greater than number of lines analyzed.");
}

if (sampleStart == null || sampleStart.isEmpty()) {
throw new IllegalArgumentException("Sample start must be specified.");
}

if (charset == null || charset.isEmpty()) {
throw new IllegalArgumentException("A character set must be specified.");
}

if (charset.toUpperCase(Locale.ROOT).startsWith("UTF") == false && hasByteOrderMarker != null) {
throw new IllegalArgumentException("A byte order marker is only possible for UTF character sets.");
}

switch (format) {
case JSON:
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}
// $FALL-THROUGH$
case XML:
if (hasHeaderRow != null) {
throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
}
if (separator != null) {
throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
}
if (grokPattern != null) {
throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
}
break;
case CSV:
case TSV:
case SEMI_COLON_SEPARATED_VALUES:
case PIPE_SEPARATED_VALUES:
if (inputFields == null || inputFields.isEmpty()) {
throw new IllegalArgumentException("Input fields must be specified for [" + format + "] structures.");
}
if (hasHeaderRow == null) {
throw new IllegalArgumentException("Has header row must be specified for [" + format + "] structures.");
}
Character expectedSeparator = format.separator();
assert expectedSeparator != null;
if (expectedSeparator.equals(separator) == false) {
throw new IllegalArgumentException("Separator must be [" + expectedSeparator + "] for [" + format +
"] structures.");
}
if (grokPattern != null) {
throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
}
break;
case SEMI_STRUCTURED_TEXT:
if (inputFields != null) {
throw new IllegalArgumentException("Input fields may not be specified for [" + format + "] structures.");
}
if (hasHeaderRow != null) {
throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
}
if (separator != null) {
throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
}
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}
if (grokPattern == null || grokPattern.isEmpty()) {
throw new IllegalArgumentException("Grok pattern must be specified for [" + format + "] structures.");
}
break;
default:
throw new IllegalStateException("enum value [" + format + "] missing from switch.");
}

if ((timestampField == null) != (timestampFormats == null || timestampFormats.isEmpty())) {
throw new IllegalArgumentException("Timestamp field and timestamp formats must both be specified or neither be specified.");
}

if (needClientTimezone && timestampField == null) {
throw new IllegalArgumentException("Client timezone cannot be needed if there is no timestamp field.");
}

if (mappings == null || mappings.isEmpty()) {
throw new IllegalArgumentException("Mappings must be specified.");
}

if (explanation == null || explanation.isEmpty()) {
throw new IllegalArgumentException("Explanation must be specified.");
}

return new LogStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern,
timestampField, timestampFormats, needClientTimezone, mappings, explanation);
}
}
}
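As a rough sketch of the Builder contract above (all values invented for illustration; build() enforces the per-format rules, e.g. a CSV structure must carry input fields and a header-row flag, and Builder(Format.CSV) sets the ',' separator automatically):

    // Hypothetical usage - not part of the commit.
    Map<String, Object> mappings = new TreeMap<>();
    mappings.put("message", Collections.singletonMap("type", "keyword"));
    LogStructure structure = new LogStructure.Builder(LogStructure.Format.CSV)
        .setNumLinesAnalyzed(20)
        .setNumMessagesAnalyzed(19)           // must not exceed the lines analyzed
        .setSampleStart("time,message\n")     // invented sample preamble
        .setCharset("UTF-8")
        .setInputFields(Arrays.asList("time", "message"))
        .setHasHeaderRow(true)
        .setMappings(mappings)
        .setExplanation(Collections.singletonList("CSV sample with a header row"))
        .build();                             // throws IllegalArgumentException if inconsistent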
@ -0,0 +1,23 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.List;

public interface LogStructureFinder {

/**
* The (possibly multi-line) messages that the log sample was split into.
* @return A list of messages.
*/
List<String> getSampleMessages();

/**
* Retrieve the structure of the log file used to instantiate the finder.
* @return The log file structure.
*/
LogStructure getStructure();
}
@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.List;

public interface LogStructureFinderFactory {

/**
* Given a sample of a log file, decide whether this factory will be able
* to create an appropriate object to represent its ingestion configs.
* @param explanation List of reasons for making decisions. May already contain items when
*                    passed, and this method may append further reasons to it.
* @param sample A sample from the log file to be ingested.
* @return <code>true</code> if this factory can create an appropriate log
*         file structure given the sample; otherwise <code>false</code>.
*/
boolean canCreateFromSample(List<String> explanation, String sample);

/**
* Create an object representing the structure of a log file.
* @param explanation List of reasons for making decisions. May already contain items when
*                    passed, and this method may append further reasons to it.
* @param sample A sample from the log file to be ingested.
* @param charsetName The name of the character set in which the sample was provided.
* @param hasByteOrderMarker Did the sample have a byte order marker? <code>null</code> means "not relevant".
* @return A log file structure object suitable for ingesting the supplied sample.
* @throws Exception if something goes wrong during creation.
*/
LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws Exception;
}
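The two methods are intended to be called as a pair: probe with canCreateFromSample() and only then pay for createFromSample(). A minimal sketch of a caller (the sample text is invented and exception handling is elided, since createFromSample is declared to throw Exception):

    // Hypothetical driver code - not part of the commit.
    List<String> explanation = new ArrayList<>();
    String sample = "{\"message\":\"hello\"}\n{\"message\":\"world\"}\n"; // invented ND-JSON sample
    LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
    if (factory.canCreateFromSample(explanation, sample)) {
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", false);
    }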
@ -0,0 +1,232 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.elasticsearch.common.collect.Tuple;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;

/**
* Runs the high-level steps needed to create ingest configs for the specified log file. In order:
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
* 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
* 4. Create an appropriate structure object and delegate writing configs to it
*/
public final class LogStructureFinderManager {

public static final int MIN_SAMPLE_LINE_COUNT = 2;

static final Set<String> FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252",
"cp1253", "cp1254", "cp1255", "cp1256", "cp1257", "cp1258", "cp819", "cp866", "csbig5", "cseuckr", "cseucpkdfmtjapanese",
"csgb2312", "csibm866", "csiso2022jp", "csiso2022kr", "csiso58gb231280", "csiso88596e", "csiso88596i", "csiso88598e", "csiso88598i",
"csisolatin1", "csisolatin2", "csisolatin3", "csisolatin4", "csisolatin5", "csisolatin6", "csisolatin9", "csisolatinarabic",
"csisolatincyrillic", "csisolatingreek", "csisolatinhebrew", "cskoi8r", "csksc56011987", "csmacintosh", "csshiftjis", "cyrillic",
"dos-874", "ecma-114", "ecma-118", "elot_928", "euc-jp", "euc-kr", "gb18030", "gb2312", "gb_2312", "gb_2312-80", "gbk", "greek",
"greek8", "hebrew", "hz-gb-2312", "ibm819", "ibm866", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-jp", "iso-2022-kr", "iso-8859-1",
"iso-8859-10", "iso-8859-11", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "iso-8859-2", "iso-8859-3", "iso-8859-4",
"iso-8859-5", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-8859-7", "iso-8859-8", "iso-8859-8-e", "iso-8859-8-i",
"iso-8859-9", "iso-ir-100", "iso-ir-101", "iso-ir-109", "iso-ir-110", "iso-ir-126", "iso-ir-127", "iso-ir-138", "iso-ir-144",
"iso-ir-148", "iso-ir-149", "iso-ir-157", "iso-ir-58", "iso8859-1", "iso8859-10", "iso8859-11", "iso8859-13", "iso8859-14",
"iso8859-15", "iso8859-2", "iso8859-3", "iso8859-4", "iso8859-5", "iso8859-6", "iso8859-6e", "iso8859-6i", "iso8859-7", "iso8859-8",
"iso8859-8e", "iso8859-8i", "iso8859-9", "iso88591", "iso885910", "iso885911", "iso885913", "iso885914", "iso885915", "iso88592",
"iso88593", "iso88594", "iso88595", "iso88596", "iso88597", "iso88598", "iso88599", "iso_8859-1", "iso_8859-15", "iso_8859-1:1987",
"iso_8859-2", "iso_8859-2:1987", "iso_8859-3", "iso_8859-3:1988", "iso_8859-4", "iso_8859-4:1988", "iso_8859-5", "iso_8859-5:1988",
"iso_8859-6", "iso_8859-6:1987", "iso_8859-7", "iso_8859-7:1987", "iso_8859-8", "iso_8859-8:1988", "iso_8859-9", "iso_8859-9:1989",
"koi", "koi8", "koi8-r", "koi8-ru", "koi8-u", "koi8_r", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "l1",
"l2", "l3", "l4", "l5", "l6", "l9", "latin1", "latin2", "latin3", "latin4", "latin5", "latin6", "logical", "mac", "macintosh",
"ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "sun_eu_greek", "tis-620", "unicode-1-1-utf-8", "us-ascii", "utf-16",
"utf-16-bom", "utf-16be", "utf-16be-bom", "utf-16le", "utf-16le-bom", "utf-8", "utf8", "visual", "windows-1250", "windows-1251",
"windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "windows-31j",
"windows-874", "windows-949", "x-cp1250", "x-cp1251", "x-cp1252", "x-cp1253", "x-cp1254", "x-cp1255", "x-cp1256", "x-cp1257",
"x-cp1258", "x-euc-jp", "x-gbk", "x-mac-cyrillic", "x-mac-roman", "x-mac-ukrainian", "x-sjis", "x-x-big5"
)));

/**
* These need to be ordered so that the more generic formats come after the more specific ones
*/
private static final List<LogStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
new JsonLogStructureFinderFactory(),
new XmlLogStructureFinderFactory(),
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
new CsvLogStructureFinderFactory(),
new TsvLogStructureFinderFactory(),
new SemiColonSeparatedValuesLogStructureFinderFactory(),
new PipeSeparatedValuesLogStructureFinderFactory(),
new TextLogStructureFinderFactory()
));

private static final int BUFFER_SIZE = 8192;

/**
* Given a stream of data from some log file, determine its structure.
* @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
*                             If the stream has fewer lines then an attempt will still be made, provided at
*                             least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.
* @param fromFile A stream from which the sample will be read.
* @return A {@link LogStructureFinder} object from which the structure and messages can be queried.
* @throws Exception A variety of problems could occur at various stages of the structure finding process.
*/
public LogStructureFinder findLogStructure(int idealSampleLineCount, InputStream fromFile) throws Exception {
return findLogStructure(new ArrayList<>(), idealSampleLineCount, fromFile);
}

public LogStructureFinder findLogStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
throws Exception {

CharsetMatch charsetMatch = findCharset(explanation, fromFile);
String charsetName = charsetMatch.getName();

Tuple<String, Boolean> sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT,
Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));

return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2());
}

CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {

// We need an input stream that supports mark and reset, so wrap the argument
// in a BufferedInputStream if it doesn't already support this feature
if (inputStream.markSupported() == false) {
inputStream = new BufferedInputStream(inputStream, BUFFER_SIZE);
}

// This is from ICU4J
CharsetDetector charsetDetector = new CharsetDetector().setText(inputStream);
CharsetMatch[] charsetMatches = charsetDetector.detectAll();

// Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
boolean pureAscii = true;
boolean containsZeroBytes = false;
inputStream.mark(BUFFER_SIZE);
byte[] workspace = new byte[BUFFER_SIZE];
int remainingLength = BUFFER_SIZE;
do {
int bytesRead = inputStream.read(workspace, 0, remainingLength);
if (bytesRead <= 0) {
break;
}
for (int i = 0; i < bytesRead && containsZeroBytes == false; ++i) {
if (workspace[i] == 0) {
containsZeroBytes = true;
pureAscii = false;
} else {
pureAscii = pureAscii && workspace[i] > 0 && workspace[i] < 128;
}
}
remainingLength -= bytesRead;
} while (containsZeroBytes == false && remainingLength > 0);
inputStream.reset();

if (pureAscii) {
// If the input is pure ASCII then many single byte character sets will match. We want to favour
// UTF-8 in this case, as it avoids putting a bold declaration of a dubious character set choice
// in the config files.
Optional<CharsetMatch> utf8CharsetMatch = Arrays.stream(charsetMatches)
.filter(charsetMatch -> StandardCharsets.UTF_8.name().equals(charsetMatch.getName())).findFirst();
if (utf8CharsetMatch.isPresent()) {
explanation.add("Using character encoding [" + StandardCharsets.UTF_8.name() +
"], which matched the input with [" + utf8CharsetMatch.get().getConfidence() + "%] confidence - first [" +
(BUFFER_SIZE / 1024) + "kB] of input was pure ASCII");
return utf8CharsetMatch.get();
}
}

// Input wasn't pure ASCII, so use the best matching character set that's supported by both Java and Go.
// Additionally, if the input contains zero bytes then avoid single byte character sets, as ICU4J will
// suggest these for binary files, but an encoding whose output never contains zero bytes cannot be the
// right choice for input that does.
for (CharsetMatch charsetMatch : charsetMatches) {
String name = charsetMatch.getName();
if (Charset.isSupported(name) && FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) {

// This extra test is to avoid trying to read binary files as text. Running the log config
// deduction algorithms on binary files is very slow as the binary files generally appear to
// have very long lines.
boolean spaceEncodingContainsZeroByte = false;
byte[] spaceBytes = " ".getBytes(name);
for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) {
spaceEncodingContainsZeroByte = (spaceBytes[i] == 0);
}
if (containsZeroBytes && spaceEncodingContainsZeroByte == false) {
explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
"%] confidence but was rejected as the input contains zero bytes and the [" + name + "] encoding does not");
} else {
explanation.add("Using character encoding [" + name + "], which matched the input with [" +
charsetMatch.getConfidence() + "%] confidence");
return charsetMatch;
}
} else {
explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
"%] confidence but was rejected as it is not supported by [" +
(Charset.isSupported(name) ? "Filebeat" : "the JVM") + "]");
}
}

throw new IllegalArgumentException("Could not determine a usable character encoding for the input" +
(containsZeroBytes ? " - could it be binary data?" : ""));
}

LogStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws Exception {

for (LogStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) {
if (factory.canCreateFromSample(explanation, sample)) {
return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker);
}
}
throw new IllegalArgumentException("Input did not match any known formats");
}

private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {

int lineCount = 0;
BufferedReader bufferedReader = new BufferedReader(reader);
StringBuilder sample = new StringBuilder();

// Don't include any byte-order-marker in the sample. (The logic to skip it works for both
// UTF-8 and UTF-16 assuming the character set of the reader was correctly detected.)
Boolean hasByteOrderMarker = null;
if (charsetName.toUpperCase(Locale.ROOT).startsWith("UTF")) {
int maybeByteOrderMarker = reader.read();
hasByteOrderMarker = ((char) maybeByteOrderMarker == '\uFEFF');
if (maybeByteOrderMarker >= 0 && hasByteOrderMarker == false && (char) maybeByteOrderMarker != '\r') {
sample.appendCodePoint(maybeByteOrderMarker);
if ((char) maybeByteOrderMarker == '\n') {
++lineCount;
}
}
}

String line;
while ((line = bufferedReader.readLine()) != null && ++lineCount <= maxLines) {
sample.append(line).append('\n');
}

if (lineCount < minLines) {
throw new IllegalArgumentException("Input contained too few lines to sample");
}

return new Tuple<>(sample.toString(), hasByteOrderMarker);
}
}
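Putting the pieces together, end-to-end usage of the manager would look roughly like this (the path is a placeholder, exception handling is elided, and 1000 mirrors the sample size mentioned in the class javadoc):

    // Hypothetical usage - not part of the commit.
    LogStructureFinderManager manager = new LogStructureFinderManager();
    try (InputStream in = Files.newInputStream(Paths.get("/var/log/example.log"))) { // invented path
        LogStructureFinder finder = manager.findLogStructure(1000, in);
        LogStructure structure = finder.getStructure();      // detected format, mappings, timestamp info
        List<String> messages = finder.getSampleMessages();  // sample split into (possibly multi-line) messages
    }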
@ -0,0 +1,238 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.grok.Grok;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;

final class LogStructureUtils {

static final String DEFAULT_TIMESTAMP_FIELD = "@timestamp";
static final String MAPPING_TYPE_SETTING = "type";
static final String MAPPING_FORMAT_SETTING = "format";
static final String MAPPING_PROPERTIES_SETTING = "properties";

// NUMBER Grok pattern doesn't support scientific notation, so we extend it
private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$");
private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$");
private static final int KEYWORD_MAX_LEN = 256;
private static final int KEYWORD_MAX_SPACES = 5;

private LogStructureUtils() {
}

/**
* Given one or more sample records, find a timestamp field that is consistently present in them all.
* To be returned the timestamp field:
* - Must exist in every record
* - Must have the same timestamp format in every record
* If multiple fields meet these criteria then the one that occurred first in the first sample record
* is chosen.
* @param explanation List of reasons for choosing the overall log structure. This list
*                    may be non-empty when the method is called, and this method may
*                    append to it.
* @param sampleRecords List of records derived from the provided log sample.
* @return A tuple of (field name, timestamp format) if one can be found, or <code>null</code> if
*         there is no consistent timestamp.
*/
static Tuple<String, TimestampMatch> guessTimestampField(List<String> explanation, List<Map<String, ?>> sampleRecords) {
if (sampleRecords.isEmpty()) {
return null;
}

// Accept the first match from the first sample that is compatible with all the other samples
for (Tuple<String, TimestampMatch> candidate : findCandidates(explanation, sampleRecords)) {

boolean allGood = true;
for (Map<String, ?> sampleRecord : sampleRecords.subList(1, sampleRecords.size())) {
Object fieldValue = sampleRecord.get(candidate.v1());
if (fieldValue == null) {
explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
"] doesn't have field");
allGood = false;
break;
}

TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString());
if (match == null || match.candidateIndex != candidate.v2().candidateIndex) {
explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
"] matches differently: [" + match + "]");
allGood = false;
break;
}
}

if (allGood) {
explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]");
return candidate;
}
}

return null;
}

private static List<Tuple<String, TimestampMatch>> findCandidates(List<String> explanation, List<Map<String, ?>> sampleRecords) {

List<Tuple<String, TimestampMatch>> candidates = new ArrayList<>();

// Get candidate timestamps from the first sample record
for (Map.Entry<String, ?> entry : sampleRecords.get(0).entrySet()) {
Object value = entry.getValue();
if (value != null) {
TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString());
if (match != null) {
Tuple<String, TimestampMatch> candidate = new Tuple<>(entry.getKey(), match);
candidates.add(candidate);
explanation.add("First sample timestamp match [" + candidate + "]");
}
}
}

return candidates;
}

/**
* Given the sampled records, guess appropriate Elasticsearch mappings.
* @param explanation List of reasons for choosing the overall log structure. This list
*                    may be non-empty when the method is called, and this method may
*                    append to it.
* @param sampleRecords The sampled records.
* @return A map of field name to mapping settings.
*/
static SortedMap<String, Object> guessMappings(List<String> explanation, List<Map<String, ?>> sampleRecords) {

SortedMap<String, Object> mappings = new TreeMap<>();

for (Map<String, ?> sampleRecord : sampleRecords) {
for (String fieldName : sampleRecord.keySet()) {
mappings.computeIfAbsent(fieldName, key -> guessMapping(explanation, fieldName,
sampleRecords.stream().flatMap(record -> {
Object fieldValue = record.get(fieldName);
return (fieldValue == null) ? Stream.empty() : Stream.of(fieldValue);
}
).collect(Collectors.toList())));
}
}

return mappings;
}

static Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {

if (fieldValues == null || fieldValues.isEmpty()) {
// We can get here if all the records that contained a given field had a null value for it.
// In this case it's best not to make any statement about what the mapping type should be.
return null;
}

if (fieldValues.stream().anyMatch(value -> value instanceof Map)) {
if (fieldValues.stream().allMatch(value -> value instanceof Map)) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
}
throw new IllegalArgumentException("Field [" + fieldName +
"] has both object and non-object values - this is not supported by Elasticsearch");
}

if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) {
// Elasticsearch fields can be either arrays or single values, but array values must all have the same type
return guessMapping(explanation, fieldName,
fieldValues.stream().flatMap(LogStructureUtils::flatten).collect(Collectors.toList()));
}

return guessScalarMapping(explanation, fieldName, fieldValues.stream().map(Object::toString).collect(Collectors.toList()));
}

private static Stream<Object> flatten(Object value) {
if (value instanceof List) {
@SuppressWarnings("unchecked")
List<Object> objectList = (List<Object>) value;
return objectList.stream();
} else if (value instanceof Object[]) {
return Arrays.stream((Object[]) value);
} else {
return Stream.of(value);
}
}

/**
* Given some sample values for a field, guess the most appropriate index mapping for the
* field.
* @param explanation List of reasons for choosing the overall log structure. This list
*                    may be non-empty when the method is called, and this method may
*                    append to it.
* @param fieldName Name of the field for which mappings are to be guessed.
* @param fieldValues Values of the field for which mappings are to be guessed. The guessed
*                    mapping will be compatible with all the provided values. Must not be
*                    empty.
* @return The sub-section of the index mappings most appropriate for the field,
*         for example <code>{ "type" : "keyword" }</code>.
*/
static Map<String, String> guessScalarMapping(List<String> explanation, String fieldName, Collection<String> fieldValues) {

assert fieldValues.isEmpty() == false;

if (fieldValues.stream().allMatch(value -> "true".equals(value) || "false".equals(value))) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean");
}

// This checks if a date mapping would be appropriate, and, if so, finds the correct format
Iterator<String> iter = fieldValues.iterator();
TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next());
while (timestampMatch != null && iter.hasNext()) {
// To be mapped as type date all the values must match the same date format - it is
// not acceptable for all values to be dates, but with different formats
if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex)) == false) {
timestampMatch = null;
}
}
if (timestampMatch != null) {
return timestampMatch.getEsDateMappingTypeWithFormat();
}

if (fieldValues.stream().allMatch(NUMBER_GROK::match)) {
try {
fieldValues.forEach(Long::parseLong);
return Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
} catch (NumberFormatException e) {
explanation.add("Rejecting type 'long' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
}
try {
fieldValues.forEach(Double::parseDouble);
return Collections.singletonMap(MAPPING_TYPE_SETTING, "double");
} catch (NumberFormatException e) {
explanation.add("Rejecting type 'double' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
}
} else if (fieldValues.stream().allMatch(IP_GROK::match)) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
}

if (fieldValues.stream().anyMatch(LogStructureUtils::isMoreLikelyTextThanKeyword)) {
return Collections.singletonMap(MAPPING_TYPE_SETTING, "text");
}

return Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
}

/**
* The thinking is that the longer the field value and the more spaces it contains,
* the more likely it is that it should be indexed as text rather than keyword.
*/
static boolean isMoreLikelyTextThanKeyword(String str) {
int length = str.length();
return length > KEYWORD_MAX_LEN || length - str.replaceAll("\\s", "").length() > KEYWORD_MAX_SPACES;
}
}
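As a rough illustration of the scalar mapping heuristics (these methods are package-private, so this would have to live in a test or another class in the same package; all values are invented):

    // Hypothetical illustration - not part of the commit.
    List<String> explanation = new ArrayList<>();
    // Booleans win first: ["true", "false"] -> { "type" : "boolean" }
    // Consistent timestamps map to a date type; then numeric types are tried:
    LogStructureUtils.guessScalarMapping(explanation, "status",
        Arrays.asList("200", "404", "500"));   // -> { "type" : "long" }
    // Values like "3.14" fail Long.parseLong and fall through to "double"; valid IPs -> { "type" : "ip" }
    // Values over 256 chars or with more than 5 spaces -> { "type" : "text" }; anything else -> "keyword"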
@ -0,0 +1,38 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.util.List;

public class PipeSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {

private static final CsvPreference PIPE_PREFERENCE = new CsvPreference.Builder('"', '|', "\n").build();

/**
* Rules are:
* - The file must be valid pipe (<code>|</code>) separated values
* - It must contain at least two complete records
* - There must be at least five fields per record (otherwise files with coincidental
*   or no pipe characters could be treated as pipe separated)
* - Every pipe separated value record except the last must have the same number of fields
* The reason the last record is allowed to have fewer fields than the others is that
* it could have been truncated when the file was sampled.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 5, PIPE_PREFERENCE, "pipe separated values");
}

@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
PIPE_PREFERENCE, true);
}
}
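The PIPE_PREFERENCE above is ordinary Super CSV configuration, which is essentially how the shared SeparatedValuesLogStructureFinder consumes it. A tiny standalone sketch (input invented, exception handling elided):

    // Hypothetical illustration - not part of the commit.
    CsvPreference pipe = new CsvPreference.Builder('"', '|', "\n").build(); // quote char, delimiter, line end
    try (CsvListReader reader = new CsvListReader(new StringReader("a|b|c\n"), pipe)) {
        List<String> row = reader.read(); // ["a", "b", "c"]
    }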
@ -0,0 +1,37 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.util.List;

public class SemiColonSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {

/**
* Rules are:
* - The file must be valid semi-colon separated values
* - It must contain at least two complete records
* - There must be at least four fields per record (otherwise files with coincidental
*   or no semi-colons could be treated as semi-colon separated)
* - Every semi-colon separated value record except the last must have the same number of fields
* The reason the last record is allowed to have fewer fields than the others is that
* it could have been truncated when the file was sampled.
*/
@Override
public boolean canCreateFromSample(List<String> explanation, String sample) {
return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 4,
CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, "semi-colon separated values");
}

@Override
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
throws IOException {
return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, false);
}
}
@ -0,0 +1,486 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.logstructurefinder;
|
||||
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
|
||||
import org.supercsv.exception.SuperCsvException;
|
||||
import org.supercsv.io.CsvListReader;
|
||||
import org.supercsv.prefs.CsvPreference;
|
||||
import org.supercsv.util.Util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.DoubleSummaryStatistics;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.SortedMap;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public class SeparatedValuesLogStructureFinder implements LogStructureFinder {
|
||||
|
||||
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
|
||||
|
||||
private final List<String> sampleMessages;
|
||||
private final LogStructure structure;
|
||||
|
||||
    static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(List<String> explanation, String sample,
                                                                                   String charsetName, Boolean hasByteOrderMarker,
                                                                                   CsvPreference csvPreference, boolean trimFields)
        throws IOException {

        Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
        List<List<String>> rows = parsed.v1();
        List<Integer> lineNumbers = parsed.v2();

        Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows);
        boolean isHeaderInFile = headerInfo.v1();
        String[] header = headerInfo.v2();
        String[] headerWithNamedBlanks = new String[header.length];
        for (int i = 0; i < header.length; ++i) {
            String rawHeader = header[i].isEmpty() ? "column" + (i + 1) : header[i];
            headerWithNamedBlanks[i] = trimFields ? rawHeader.trim() : rawHeader;
        }

        List<String> sampleLines = Arrays.asList(sample.split("\n"));
        List<String> sampleMessages = new ArrayList<>();
        List<Map<String, ?>> sampleRecords = new ArrayList<>();
        int prevMessageEndLineNumber = isHeaderInFile ? lineNumbers.get(0) : -1;
        for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
            List<String> row = rows.get(index);
            int lineNumber = lineNumbers.get(index);
            Map<String, String> sampleRecord = new LinkedHashMap<>();
            Util.filterListToMap(sampleRecord, headerWithNamedBlanks,
                trimFields ? row.stream().map(String::trim).collect(Collectors.toList()) : row);
            sampleRecords.add(sampleRecord);
            sampleMessages.add(
                sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index)).stream().collect(Collectors.joining("\n")));
            prevMessageEndLineNumber = lineNumber;
        }

        String preamble = Pattern.compile("\n").splitAsStream(sample).limit(lineNumbers.get(1)).collect(Collectors.joining("\n", "", "\n"));

        char delimiter = (char) csvPreference.getDelimiterChar();
        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.fromSeparator(delimiter))
            .setCharset(charsetName)
            .setHasByteOrderMarker(hasByteOrderMarker)
            .setSampleStart(preamble)
            .setNumLinesAnalyzed(lineNumbers.get(lineNumbers.size() - 1))
            .setNumMessagesAnalyzed(sampleRecords.size())
            .setHasHeaderRow(isHeaderInFile)
            .setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList()));

        if (trimFields) {
            structureBuilder.setShouldTrimFields(true);
        }

        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
        if (timeField != null) {
            String timeLineRegex = null;
            StringBuilder builder = new StringBuilder("^");
            // We make the assumption that the timestamp will be on the first line of each record.  Therefore, if the
            // timestamp is the last column then either our assumption is wrong (and the approach will completely
            // break down) or else every record is on a single line and there's no point creating a multiline config.
            // This is why the loop excludes the last column.
            for (String column : Arrays.asList(header).subList(0, header.length - 1)) {
                if (timeField.v1().equals(column)) {
                    builder.append("\"?");
                    String simpleTimePattern = timeField.v2().simplePattern.pattern();
                    builder.append(simpleTimePattern.startsWith("\\b") ? simpleTimePattern.substring(2) : simpleTimePattern);
                    timeLineRegex = builder.toString();
                    break;
                } else {
                    builder.append(".*?");
                    if (delimiter == '\t') {
                        builder.append("\\t");
                    } else {
                        builder.append(delimiter);
                    }
                }
            }

            if (isHeaderInFile) {
                structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
                    .map(column -> "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?")
                    .collect(Collectors.joining(",")));
            }

            structureBuilder.setTimestampField(timeField.v1())
                .setTimestampFormats(timeField.v2().dateFormats)
                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing())
                .setMultilineStartPattern(timeLineRegex);
        }

        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));

        LogStructure structure = structureBuilder
            .setMappings(mappings)
            .setExplanation(explanation)
            .build();

        return new SeparatedValuesLogStructureFinder(sampleMessages, structure);
    }
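
    // A worked example of the two patterns built above, assuming a header row of "timestamp,level,message"
    // where the timestamp is the first column and its simple pattern is
    // \b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} (values invented for illustration):
    //   multiline start pattern: ^"?\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}
    //   exclude lines pattern:   ^"?timestamp"?,"?level"?,"?message"?
    // (the exclude pattern is only emitted when a header row was detected in the sample)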

    private SeparatedValuesLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
        this.structure = structure;
    }

    @Override
    public List<String> getSampleMessages() {
        return sampleMessages;
    }

    @Override
    public LogStructure getStructure() {
        return structure;
    }

    static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPreference csvPreference) throws IOException {

        int fieldsInFirstRow = -1;

        List<List<String>> rows = new ArrayList<>();
        List<Integer> lineNumbers = new ArrayList<>();

        try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {

            try {
                List<String> row;
                while ((row = csvReader.read()) != null) {
                    if (fieldsInFirstRow < 0) {
                        fieldsInFirstRow = row.size();
                    } else {
                        // Tolerate extra columns if and only if they're empty
                        while (row.size() > fieldsInFirstRow && row.get(row.size() - 1) == null) {
                            row.remove(row.size() - 1);
                        }
                    }
                    rows.add(row);
                    lineNumbers.add(csvReader.getLineNumber());
                }
            } catch (SuperCsvException e) {
                // Tolerate an incomplete last row
                if (notUnexpectedEndOfFile(e)) {
                    throw e;
                }
            }
        }

        assert rows.isEmpty() == false;
        assert lineNumbers.size() == rows.size();

        if (rows.get(0).size() != rows.get(rows.size() - 1).size()) {
            rows.remove(rows.size() - 1);
            lineNumbers.remove(lineNumbers.size() - 1);
        }

        // This should have been enforced by canCreateFromSample()
        assert rows.size() > 1;

        return new Tuple<>(rows, lineNumbers);
    }

    static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, List<List<String>> rows) {

        assert rows.isEmpty() == false;

        List<String> firstRow = rows.get(0);

        boolean isHeaderInFile = true;
        if (rowContainsDuplicateNonEmptyValues(firstRow)) {
            isHeaderInFile = false;
            explanation.add("First row contains duplicate values, so assuming it's not a header");
        } else {
            if (rows.size() < 3) {
                explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
            } else {
                isHeaderInFile = isFirstRowUnusual(explanation, rows);
            }
        }

        if (isHeaderInFile) {
            // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
            return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new));
        } else {
            return new Tuple<>(false, IntStream.rangeClosed(1, firstRow.size()).mapToObj(num -> "column" + num).toArray(String[]::new));
        }
    }

    static boolean rowContainsDuplicateNonEmptyValues(List<String> row) {

        HashSet<String> values = new HashSet<>();

        for (String value : row) {
            if (value != null && value.isEmpty() == false && values.add(value) == false) {
                return true;
            }
        }

        return false;
    }
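
    // For example, a first row of ["a", "b", "a"] contains the duplicate non-empty value "a", so it cannot be
    // a header (column names should be distinct), whereas ["a", "", "", "b"] does not rule out a header
    // because only the empty values repeat.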

    private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {

        assert rows.size() >= 3;

        List<String> firstRow = rows.get(0);
        String firstRowStr = firstRow.stream().map(field -> (field == null) ? "" : field).collect(Collectors.joining(""));
        List<List<String>> otherRows = rows.subList(1, rows.size());
        List<String> otherRowStrs = new ArrayList<>();
        for (List<String> row : otherRows) {
            otherRowStrs.add(row.stream().map(str -> (str == null) ? "" : str).collect(Collectors.joining("")));
        }

        // Check lengths

        double firstRowLength = firstRowStr.length();
        DoubleSummaryStatistics otherRowStats = otherRowStrs.stream().mapToDouble(otherRow -> (double) otherRow.length())
            .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);

        double otherLengthRange = otherRowStats.getMax() - otherRowStats.getMin();
        if (firstRowLength < otherRowStats.getMin() - otherLengthRange / 10.0 ||
            firstRowLength > otherRowStats.getMax() + otherLengthRange / 10.0) {
            explanation.add("First row is unusual based on length test: [" + firstRowLength + "] and [" +
                toNiceString(otherRowStats) + "]");
            return true;
        }

        explanation.add("First row is not unusual based on length test: [" + firstRowLength + "] and [" +
            toNiceString(otherRowStats) + "]");

        // Check edit distances

        DoubleSummaryStatistics firstRowStats = otherRows.stream().limit(MAX_LEVENSHTEIN_COMPARISONS)
            .mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow))
            .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);

        otherRowStats = new DoubleSummaryStatistics();
        int numComparisons = 0;
        int proportion = otherRowStrs.size() / MAX_LEVENSHTEIN_COMPARISONS;
        int innerIncrement = 1 + proportion * proportion;
        Random random = new Random(firstRow.hashCode());
        for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
            for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS && j < otherRowStrs.size();
                 j += innerIncrement) {
                otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j)));
                ++numComparisons;
            }
        }

        if (firstRowStats.getAverage() > otherRowStats.getAverage() * 1.2) {
            explanation.add("First row is unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
                "] and [" + toNiceString(otherRowStats) + "]");
            return true;
        }

        explanation.add("First row is not unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
            "] and [" + toNiceString(otherRowStats) + "]");

        return false;
    }

    private static String toNiceString(DoubleSummaryStatistics stats) {
        return String.format(Locale.ROOT, "count=%d, min=%f, average=%f, max=%f", stats.getCount(), stats.getMin(), stats.getAverage(),
            stats.getMax());
    }

    /**
     * Sum of the Levenshtein distances between corresponding elements
     * in the two supplied lists _excluding_ the biggest difference.
     * The reason the biggest difference is excluded is that sometimes
     * there's a "message" field that is much longer than any of the other
     * fields, varies enormously between rows, and skews the comparison.
     */
    static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow) {

        int largestSize = Math.max(firstRow.size(), secondRow.size());
        if (largestSize <= 1) {
            return 0;
        }

        int[] distances = new int[largestSize];

        for (int index = 0; index < largestSize; ++index) {
            distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "",
                (index < secondRow.size()) ? secondRow.get(index) : "");
        }

        Arrays.sort(distances);

        return IntStream.of(distances).limit(distances.length - 1).sum();
    }
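
    // For example, comparing ["2018-08-14", "INFO", "session opened for user fred"] with
    // ["2018-08-15", "WARN", "disk space low"] might yield per-field distances of [1, 4, 23]; the largest
    // (the free-text message field) is discarded, so the rows compare at 1 + 4 = 5.  (Distances illustrative.)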

    /**
     * This method implements the simple algorithm for calculating Levenshtein distance.
     */
    static int levenshteinDistance(String first, String second) {

        // There are some examples with pretty pictures of the matrix on Wikipedia here:
        // http://en.wikipedia.org/wiki/Levenshtein_distance

        int firstLen = (first == null) ? 0 : first.length();
        int secondLen = (second == null) ? 0 : second.length();
        if (firstLen == 0) {
            return secondLen;
        }
        if (secondLen == 0) {
            return firstLen;
        }

        int[] currentCol = new int[secondLen + 1];
        int[] prevCol = new int[secondLen + 1];

        // Populate the left column
        for (int down = 0; down <= secondLen; ++down) {
            currentCol[down] = down;
        }

        // Calculate the other entries in the matrix
        for (int across = 1; across <= firstLen; ++across) {
            int[] tmp = prevCol;
            prevCol = currentCol;
            // We could allocate a new array for currentCol here, but it's more efficient to reuse the one that's now redundant
            currentCol = tmp;

            currentCol[0] = across;

            for (int down = 1; down <= secondLen; ++down) {

                // Do the strings differ at the point we've reached?
                if (first.charAt(across - 1) == second.charAt(down - 1)) {

                    // No, they're the same => no extra cost
                    currentCol[down] = prevCol[down - 1];
                } else {
                    // Yes, they differ, so there are 3 options:

                    // 1) Deletion => cell to the left's value plus 1
                    int option1 = prevCol[down];

                    // 2) Insertion => cell above's value plus 1
                    int option2 = currentCol[down - 1];

                    // 3) Substitution => cell above left's value plus 1
                    int option3 = prevCol[down - 1];

                    // Take the cheapest option of the 3
                    currentCol[down] = Math.min(Math.min(option1, option2), option3) + 1;
                }
            }
        }

        // Result is the value in the bottom right hand corner of the matrix
        return currentCol[secondLen];
    }
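
    // The classic worked example: levenshteinDistance("kitten", "sitting") == 3
    // (substitute k->s, substitute e->i, insert g).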

    static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) {
        char quote = csvPreference.getQuoteChar();
        String lineWithEscapedQuotesRemoved = line.replace(String.valueOf(quote) + quote, "");
        for (int index = 1; index < lineWithEscapedQuotesRemoved.length() - 1; ++index) {
            if (lineWithEscapedQuotesRemoved.charAt(index) == quote &&
                lineWithEscapedQuotesRemoved.codePointAt(index - 1) != csvPreference.getDelimiterChar() &&
                lineWithEscapedQuotesRemoved.codePointAt(index + 1) != csvPreference.getDelimiterChar()) {
                return true;
            }
        }
        return false;
    }
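
    // With standard preferences (quote = '"', delimiter = ','), lineHasUnescapedQuote returns false for
    //   a,"b",c       (every quote is adjacent to a delimiter or a line end)
    // but true for
    //   a,b"c,d       (the quote sits in the middle of a field)
    // which is exactly the partial quoting that Logstash's CSV parser rejects.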

    static boolean canCreateFromSample(List<String> explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference,
                                       String formatName) {

        // Logstash's CSV parser won't tolerate fields where just part of the
        // value is quoted, whereas SuperCSV will, hence this extra check
        String[] sampleLines = sample.split("\n");
        for (String sampleLine : sampleLines) {
            if (lineHasUnescapedQuote(sampleLine, csvPreference)) {
                explanation.add("Not " + formatName +
                    " because a line has an unescaped quote that is not at the beginning or end of a field: [" + sampleLine + "]");
                return false;
            }
        }

        try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {

            int fieldsInFirstRow = -1;
            int fieldsInLastRow = -1;

            int numberOfRows = 0;
            try {
                List<String> row;
                while ((row = csvReader.read()) != null) {

                    int fieldsInThisRow = row.size();
                    ++numberOfRows;
                    if (fieldsInFirstRow < 0) {
                        fieldsInFirstRow = fieldsInThisRow;
                        if (fieldsInFirstRow < minFieldsPerRow) {
                            explanation.add("Not " + formatName + " because the first row has fewer than [" + minFieldsPerRow +
                                "] fields: [" + fieldsInFirstRow + "]");
                            return false;
                        }
                        fieldsInLastRow = fieldsInFirstRow;
                        continue;
                    }

                    // Tolerate extra columns if and only if they're empty
                    while (fieldsInThisRow > fieldsInFirstRow && row.get(fieldsInThisRow - 1) == null) {
                        --fieldsInThisRow;
                    }

                    if (fieldsInLastRow != fieldsInFirstRow) {
                        explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) +
                            "] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" +
                            fieldsInLastRow + "]");
                        return false;
                    }

                    fieldsInLastRow = fieldsInThisRow;
                }

                if (fieldsInLastRow > fieldsInFirstRow) {
                    explanation.add("Not " + formatName + " because last row has more fields than first row: [" + fieldsInFirstRow +
                        "] and [" + fieldsInLastRow + "]");
                    return false;
                }
                if (fieldsInLastRow < fieldsInFirstRow) {
                    --numberOfRows;
                }
            } catch (SuperCsvException e) {
                // Tolerate an incomplete last row
                if (notUnexpectedEndOfFile(e)) {
                    explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
                    return false;
                }
            }
            if (numberOfRows <= 1) {
                explanation.add("Not " + formatName + " because fewer than 2 complete records in sample: [" + numberOfRows + "]");
                return false;
            }
            explanation.add("Deciding sample is " + formatName);
            return true;

        } catch (IOException e) {
            explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
            return false;
        }
    }

    private static boolean notUnexpectedEndOfFile(SuperCsvException e) {
        return e.getMessage().startsWith("unexpected end of file while reading quoted column") == false;
    }
}
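
A rough usage sketch of the check above (inside a test or main method; the sample string is invented, while canCreateFromSample and SuperCSV's CsvPreference.STANDARD_PREFERENCE are real):

List<String> explanation = new ArrayList<>();
boolean isCsv = SeparatedValuesLogStructureFinder.canCreateFromSample(explanation,
    "time,level,message\n2018-08-14 16:23:45,INFO,all good\n2018-08-14 16:23:46,WARN,\"disk, nearly full\"\n",
    2, CsvPreference.STANDARD_PREFERENCE, "CSV");
// isCsv == true, and the explanation now ends with "Deciding sample is CSV"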

@@ -0,0 +1,201 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;

public class TextLogStructureFinder implements LogStructureFinder {

    private final List<String> sampleMessages;
    private final LogStructure structure;

    static TextLogStructureFinder makeTextLogStructureFinder(List<String> explanation, String sample, String charsetName,
                                                             Boolean hasByteOrderMarker) {

        String[] sampleLines = sample.split("\n");
        Tuple<TimestampMatch, Set<String>> bestTimestamp = mostLikelyTimestamp(sampleLines);
        if (bestTimestamp == null) {
            // Is it appropriate to treat a file that is neither structured nor has
            // a regular pattern of timestamps as a log file?  Probably not...
            throw new IllegalArgumentException("Could not find a timestamp in the log sample provided");
        }

        explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]");

        List<String> sampleMessages = new ArrayList<>();
        StringBuilder preamble = new StringBuilder();
        int linesConsumed = 0;
        StringBuilder message = null;
        int linesInMessage = 0;
        String multiLineRegex = createMultiLineMessageStartRegex(bestTimestamp.v2(), bestTimestamp.v1().simplePattern.pattern());
        Pattern multiLinePattern = Pattern.compile(multiLineRegex);
        for (String sampleLine : sampleLines) {
            if (multiLinePattern.matcher(sampleLine).find()) {
                if (message != null) {
                    sampleMessages.add(message.toString());
                    linesConsumed += linesInMessage;
                }
                message = new StringBuilder(sampleLine);
                linesInMessage = 1;
            } else {
                // If message is null here then the sample probably began with the incomplete ending of a previous message
                if (message == null) {
                    // We count lines before the first message as consumed (just like we would
                    // for the CSV header or lines before the first XML document starts)
                    ++linesConsumed;
                } else {
                    message.append('\n').append(sampleLine);
                    ++linesInMessage;
                }
            }
            if (sampleMessages.size() < 2) {
                preamble.append(sampleLine).append('\n');
            }
        }
        // Don't add the last message, as it might be partial and mess up subsequent pattern finding

        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.SEMI_STRUCTURED_TEXT)
            .setCharset(charsetName)
            .setHasByteOrderMarker(hasByteOrderMarker)
            .setSampleStart(preamble.toString())
            .setNumLinesAnalyzed(linesConsumed)
            .setNumMessagesAnalyzed(sampleMessages.size())
            .setMultilineStartPattern(multiLineRegex);

        SortedMap<String, Object> mappings = new TreeMap<>();
        mappings.put("message", Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"));
        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));

        // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
        String interimTimestampField;
        String grokPattern;
        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
        Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern();
        if (timestampFieldAndFullMatchGrokPattern != null) {
            interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
            grokPattern = timestampFieldAndFullMatchGrokPattern.v2();
        } else {
            interimTimestampField = "timestamp";
            grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField);
        }

        LogStructure structure = structureBuilder
            .setTimestampField(interimTimestampField)
            .setTimestampFormats(bestTimestamp.v1().dateFormats)
            .setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing())
            .setGrokPattern(grokPattern)
            .setMappings(mappings)
            .setExplanation(explanation)
            .build();

        return new TextLogStructureFinder(sampleMessages, structure);
    }

    private TextLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
        this.structure = structure;
    }

    @Override
    public List<String> getSampleMessages() {
        return sampleMessages;
    }

    @Override
    public LogStructure getStructure() {
        return structure;
    }

    static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLines) {

        Map<TimestampMatch, Tuple<Double, Set<String>>> timestampMatches = new LinkedHashMap<>();

        int remainingLines = sampleLines.length;
        double differenceBetweenTwoHighestWeights = 0.0;
        for (String sampleLine : sampleLines) {
            TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine);
            if (match != null) {
                TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
                    match.grokPatternName, "");
                timestampMatches.compute(pureMatch, (k, v) -> {
                    if (v == null) {
                        return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface)));
                    } else {
                        v.v2().add(match.preface);
                        return new Tuple<>(v.v1() + weightForMatch(match.preface), v.v2());
                    }
                });
                differenceBetweenTwoHighestWeights = findDifferenceBetweenTwoHighestWeights(timestampMatches.values());
            }
            // The highest possible weight is 1, so if the difference between the two highest weights
            // is more than the number of lines remaining then the leader cannot possibly be overtaken
            if (differenceBetweenTwoHighestWeights > --remainingLines) {
                break;
            }
        }

        double highestWeight = 0.0;
        Tuple<TimestampMatch, Set<String>> highestWeightMatch = null;
        for (Map.Entry<TimestampMatch, Tuple<Double, Set<String>>> entry : timestampMatches.entrySet()) {
            double weight = entry.getValue().v1();
            if (weight > highestWeight) {
                highestWeight = weight;
                highestWeightMatch = new Tuple<>(entry.getKey(), entry.getValue().v2());
            }
        }
        return highestWeightMatch;
    }

    /**
     * Used to weight a timestamp match according to how far along the line it is found.
     * Timestamps at the very beginning of the line are given a weight of 1.  The weight
     * progressively decreases the more text there is preceding the timestamp match, but
     * is always greater than 0.
     * @return A weight in the range (0, 1].
     */
    private static double weightForMatch(String preface) {
        return Math.pow(1.0 + preface.length() / 15.0, -1.1);
    }
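
    // Worked values: an empty preface gives (1 + 0/15)^-1.1 = 1.0, a 15 character preface gives
    // 2^-1.1 (roughly 0.47), and a 30 character preface gives 3^-1.1 (roughly 0.30).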

    private static double findDifferenceBetweenTwoHighestWeights(Collection<Tuple<Double, Set<String>>> timestampMatches) {
        double highestWeight = 0.0;
        double secondHighestWeight = 0.0;
        for (Tuple<Double, Set<String>> timestampMatch : timestampMatches) {
            double weight = timestampMatch.v1();
            if (weight > highestWeight) {
                secondHighestWeight = highestWeight;
                highestWeight = weight;
            } else if (weight > secondHighestWeight) {
                secondHighestWeight = weight;
            }
        }
        return highestWeight - secondHighestWeight;
    }

    static String createMultiLineMessageStartRegex(Collection<String> prefaces, String timestampRegex) {

        StringBuilder builder = new StringBuilder("^");
        GrokPatternCreator.addIntermediateRegex(builder, prefaces);
        builder.append(timestampRegex);
        if (builder.substring(0, 3).equals("^\\b")) {
            builder.delete(1, 3);
        }
        return builder.toString();
    }
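
    // For example, if every line's timestamp had an empty preface and the winning candidate's simple pattern is
    // \b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}, then (assuming GrokPatternCreator.addIntermediateRegex contributes
    // nothing for an all-empty preface set) the result is ^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} - the leading \b
    // is stripped because it immediately follows ^.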
}

@@ -0,0 +1,39 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.List;
import java.util.regex.Pattern;

public class TextLogStructureFinderFactory implements LogStructureFinderFactory {

    // This works because, by default, dot doesn't match newlines
    private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+.");

    /**
     * This format matches if the sample contains at least one newline and at least two
     * non-blank lines.
     */
    @Override
    public boolean canCreateFromSample(List<String> explanation, String sample) {
        if (sample.indexOf('\n') < 0) {
            explanation.add("Not text because sample contains no newlines");
            return false;
        }
        if (TWO_NON_BLANK_LINES_PATTERN.matcher(sample).find() == false) {
            explanation.add("Not text because sample contains fewer than two non-blank lines");
            return false;
        }

        explanation.add("Deciding sample is text");
        return true;
    }

    @Override
    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker) {
        return TextLogStructureFinder.makeTextLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
    }
}

@@ -0,0 +1,427 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.grok.Grok;

import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Used to find the best timestamp format for one of the following situations:
 * 1. Matching an entire field value
 * 2. Matching a timestamp found somewhere within a message
 */
public final class TimestampFormatFinder {

    private static final String PREFACE = "preface";
    private static final String EPILOGUE = "epilogue";

    private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([:.,])(\\d{3,9})");
    private static final char DEFAULT_FRACTIONAL_SECOND_SEPARATOR = ',';

    /**
     * The timestamp patterns are complex and it can be slow to prove they do not
     * match anywhere in a long message.  Many of the timestamps are similar and
     * will never be found in a string if simpler sub-patterns do not exist in the
     * string.  These sub-patterns can be used to quickly rule out multiple complex
     * patterns.  These patterns do not need to represent quantities that are
     * useful to know the value of, merely character sequences that can be used to
     * prove that <em>several</em> more complex patterns cannot possibly match.
     */
    private static final List<Pattern> QUICK_RULE_OUT_PATTERNS = Arrays.asList(
        // YYYY-MM-dd followed by a space
        Pattern.compile("\\b\\d{4}-\\d{2}-\\d{2} "),
        // The end of some number (likely year or day) followed by a space then HH:mm
        Pattern.compile("\\d \\d{2}:\\d{2}\\b"),
        // HH:mm:ss surrounded by spaces
        Pattern.compile(" \\d{2}:\\d{2}:\\d{2} ")
    );
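
    // For example, a line like "login failed for user admin" matches none of the three sub-patterns, so every
    // candidate below that lists quick rule-out indices (e.g. the longhand ISO8601 variants, which reference
    // patterns 0 and 1) is skipped without its far more expensive Grok pattern ever being run.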

    /**
     * The first match in this list will be chosen, so it needs to be ordered
     * such that more generic patterns come after more specific patterns.
     */
    static final List<CandidateTimestampFormat> ORDERED_CANDIDATE_FORMATS = Arrays.asList(
        // The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but
        // with a space before the timezone, and because the timezone is optional in ISO8601 it will
        // be recognised as that with the timezone missed off if ISO8601 is checked first
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
            "\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
            "TOMCAT_DATESTAMP", Arrays.asList(0, 1)),
        // The Elasticsearch ISO8601 parser requires a literal T between the date and time, so
        // longhand formats are needed if there's a space instead
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b",
            "TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b",
            "TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601",
            Arrays.asList(0, 1)),
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601",
            Arrays.asList(0, 1)),
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601",
            Arrays.asList(0, 1)),
        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)\\b", "TIMESTAMP_ISO8601",
            Arrays.asList(0, 1)),
        new CandidateTimestampFormat("ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b",
            "TIMESTAMP_ISO8601"),
        new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz",
            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ}\\b", "DATESTAMP_RFC822", Arrays.asList(1, 2)),
        new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ",
            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE} %{TZ}\\b", "DATESTAMP_RFC822", Collections.singletonList(1)),
        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ",
            "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}:%{MINUTE})\\b",
            "DATESTAMP_RFC2822", Arrays.asList(1, 2)),
        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z",
            "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
            "DATESTAMP_RFC2822", Arrays.asList(1, 2)),
        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822",
            Collections.singletonList(1)),
        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822",
            Collections.singletonList(1)),
        new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY",
            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER",
            Arrays.asList(1, 2)),
        new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY",
            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE} %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Collections.singletonList(1)),
        new CandidateTimestampFormat("YYYYMMddHHmmss", "\\b\\d{14}\\b",
            "\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b",
            "DATESTAMP_EVENTLOG"),
        new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY",
            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b",
            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", Arrays.asList(1, 2)),
        new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM d HH:mm:ss,SSS"),
            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
            "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "SYSLOGTIMESTAMP",
            Collections.singletonList(1)),
        new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b",
            "SYSLOGTIMESTAMP", Collections.singletonList(1)),
        new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
            "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE"),
        new CandidateTimestampFormat("MMM dd, YYYY K:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
            "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP"),
        new CandidateTimestampFormat(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
            "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", Collections.singletonList(1)),
        new CandidateTimestampFormat("UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"),
        new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"),
        new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"),
        new CandidateTimestampFormat("TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM")
    );

    private TimestampFormatFinder() {
    }

    /**
     * Find the first timestamp format that matches part of the supplied value.
     * @param text The value that the returned timestamp format must exist within.
     * @return The timestamp format, or <code>null</code> if none matches.
     */
    public static TimestampMatch findFirstMatch(String text) {
        return findFirstMatch(text, 0);
    }

    /**
     * Find the first timestamp format that matches part of the supplied value,
     * excluding a specified number of candidate formats.
     * @param text The value that the returned timestamp format must exist within.
     * @param ignoreCandidates The number of candidate formats to exclude from the search.
     * @return The timestamp format, or <code>null</code> if none matches.
     */
    public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) {
        Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()];
        int index = ignoreCandidates;
        for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
            boolean quicklyRuledOut = false;
            for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) {
                if (quickRuleoutMatches[quickRuleOutIndex] == null) {
                    quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find();
                }
                if (quickRuleoutMatches[quickRuleOutIndex] == false) {
                    quicklyRuledOut = true;
                    break;
                }
            }
            if (quicklyRuledOut == false) {
                Map<String, Object> captures = candidate.strictSearchGrok.captures(text);
                if (captures != null) {
                    String preface = captures.getOrDefault(PREFACE, "").toString();
                    String epilogue = captures.getOrDefault(EPILOGUE, "").toString();
                    return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(),
                        text.length() - epilogue.length()), epilogue);
                }
            }
            ++index;
        }
        return null;
    }
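
    // An illustrative call: findFirstMatch("[2018-05-17T16:14:56,374][INFO ][o.e.n.Node] initialized") should
    // return the "ISO8601" candidate, with preface "[" and epilogue "][INFO ][o.e.n.Node] initialized"; the
    // earlier space-separated candidates are ruled out quickly because the text contains no
    // "\d{4}-\d{2}-\d{2} " sequence.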

    /**
     * Find the best timestamp format for matching an entire field value.
     * @param text The value that the returned timestamp format must match in its entirety.
     * @return The timestamp format, or <code>null</code> if none matches.
     */
    public static TimestampMatch findFirstFullMatch(String text) {
        return findFirstFullMatch(text, 0);
    }

    /**
     * Find the best timestamp format for matching an entire field value,
     * excluding a specified number of candidate formats.
     * @param text The value that the returned timestamp format must match in its entirety.
     * @param ignoreCandidates The number of candidate formats to exclude from the search.
     * @return The timestamp format, or <code>null</code> if none matches.
     */
    public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) {
        int index = ignoreCandidates;
        for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
            Map<String, Object> captures = candidate.strictFullMatchGrok.captures(text);
            if (captures != null) {
                return makeTimestampMatch(candidate, index, "", text, "");
            }
            ++index;
        }
        return null;
    }

    private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosenTimestampFormat, int chosenIndex,
                                                     String preface, String matchedDate, String epilogue) {
        Tuple<Character, Integer> fractionalSecondsInterpretation = interpretFractionalSeconds(matchedDate);
        List<String> dateFormats = chosenTimestampFormat.dateFormats;
        Pattern simplePattern = chosenTimestampFormat.simplePattern;
        char separator = fractionalSecondsInterpretation.v1();
        if (separator != DEFAULT_FRACTIONAL_SECOND_SEPARATOR) {
            dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator))
                .collect(Collectors.toList());
            if (dateFormats.stream().noneMatch(dateFormat -> dateFormat.startsWith("UNIX"))) {
                String patternStr = simplePattern.pattern();
                int separatorPos = patternStr.lastIndexOf(DEFAULT_FRACTIONAL_SECOND_SEPARATOR);
                if (separatorPos >= 0) {
                    StringBuilder newPatternStr = new StringBuilder(patternStr);
                    newPatternStr.replace(separatorPos, separatorPos + 1, ((separator == '.') ? "\\" : "") + separator);
                    simplePattern = Pattern.compile(newPatternStr.toString());
                }
            }
        }
        int numberOfDigitsInFractionalComponent = fractionalSecondsInterpretation.v2();
        if (numberOfDigitsInFractionalComponent > 3) {
            String fractionalSecondsFormat = "SSSSSSSSS".substring(0, numberOfDigitsInFractionalComponent);
            dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace("SSS", fractionalSecondsFormat))
                .collect(Collectors.toList());
        }
        return new TimestampMatch(chosenIndex, preface, dateFormats, simplePattern, chosenTimestampFormat.standardGrokPatternName,
            epilogue);
    }

    /**
     * Interpret the fractional seconds component of a date to determine two things:
     * 1. The separator character - one of colon, comma and dot.
     * 2. The number of digits in the fractional component.
     * @param date The textual representation of the date for which fractional seconds are to be interpreted.
     * @return A tuple of (fractional second separator character, number of digits in fractional component).
     */
    static Tuple<Character, Integer> interpretFractionalSeconds(String date) {

        Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(date);
        if (matcher.find()) {
            return new Tuple<>(matcher.group(1).charAt(0), matcher.group(2).length());
        }

        return new Tuple<>(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, 0);
    }
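
    // Examples: "2018-05-17 16:23:45,123"    -> (',', 3)
    //           "2018-05-17T16:23:45.123456" -> ('.', 6)
    //           "2018-05-17 16:23:45"        -> (',', 0)  (the default separator, no fractional digits)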

    /**
     * Represents a timestamp that has matched a field value or been found within a message.
     */
    public static final class TimestampMatch {

        /**
         * The index of the corresponding entry in the <code>ORDERED_CANDIDATE_FORMATS</code> list.
         */
        public final int candidateIndex;

        /**
         * Text that came before the timestamp in the matched field/message.
         */
        public final String preface;

        /**
         * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers.
         */
        public final List<String> dateFormats;

        /**
         * A simple regex that will work in many languages to detect whether the timestamp format
         * exists in a particular line.
         */
        public final Pattern simplePattern;

        /**
         * Name of an out-of-the-box Grok pattern that will match the timestamp.
         */
        public final String grokPatternName;

        /**
         * Text that came after the timestamp in the matched field/message.
         */
        public final String epilogue;

        TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue) {
            this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
        }

        TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue,
                       boolean hasFractionalComponentSmallerThanMillisecond) {
            this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
        }

        TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, String simpleRegex, String grokPatternName,
                       String epilogue) {
            this(candidateIndex, preface, dateFormats, Pattern.compile(simpleRegex), grokPatternName, epilogue);
        }

        TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, Pattern simplePattern, String grokPatternName,
                       String epilogue) {
            this.candidateIndex = candidateIndex;
            this.preface = preface;
            this.dateFormats = dateFormats;
            this.simplePattern = simplePattern;
            this.grokPatternName = grokPatternName;
            this.epilogue = epilogue;
        }

        /**
         * Does parsing the timestamp produce different results depending on the timezone of the parser?
         * I.e., does the textual representation NOT define the timezone?
         */
        public boolean hasTimezoneDependentParsing() {
            return dateFormats.stream()
                .anyMatch(dateFormat -> dateFormat.contains("HH") && dateFormat.toLowerCase(Locale.ROOT).indexOf('z') == -1);
        }
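
        // For example, "YYYY-MM-dd HH:mm:ss" is timezone dependent (a time of day but no zone), whereas
        // "YYYY-MM-dd HH:mm:ssZ" and "UNIX" are not - the first carries an explicit offset and the second
        // contains no "HH" at all.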

        /**
         * Sometimes Elasticsearch mappings for dates need to include the format.
         * This method returns appropriate mappings settings: at minimum "type"="date",
         * and possibly also a "format" setting.
         */
        public Map<String, String> getEsDateMappingTypeWithFormat() {
            if (dateFormats.contains("TAI64N")) {
                // There's no format for TAI64N in the date formats used in mappings
                return Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
            }
            Map<String, String> mapping = new LinkedHashMap<>();
            mapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
            String formats = dateFormats.stream().flatMap(format -> {
                switch (format) {
                    case "ISO8601":
                        return Stream.empty();
                    case "UNIX_MS":
                        return Stream.of("epoch_millis");
                    case "UNIX":
                        return Stream.of("epoch_second");
                    default:
                        return Stream.of(format);
                }
            }).collect(Collectors.joining("||"));
            if (formats.isEmpty() == false) {
                mapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, formats);
            }
            return mapping;
        }
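
        // Examples: dateFormats ["ISO8601"]        -> { "type": "date" }
        //           dateFormats ["UNIX_MS"]        -> { "type": "date", "format": "epoch_millis" }
        //           dateFormats ["MMM dd HH:mm:ss",
        //                        "MMM d HH:mm:ss"] -> { "type": "date", "format": "MMM dd HH:mm:ss||MMM d HH:mm:ss" }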

        @Override
        public int hashCode() {
            return Objects.hash(candidateIndex, preface, dateFormats, simplePattern.pattern(), grokPatternName, epilogue);
        }

        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (other == null || getClass() != other.getClass()) {
                return false;
            }

            TimestampMatch that = (TimestampMatch) other;
            return this.candidateIndex == that.candidateIndex &&
                Objects.equals(this.preface, that.preface) &&
                Objects.equals(this.dateFormats, that.dateFormats) &&
                Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) &&
                Objects.equals(this.grokPatternName, that.grokPatternName) &&
                Objects.equals(this.epilogue, that.epilogue);
        }

        @Override
        public String toString() {
            return "index = " + candidateIndex + (preface.isEmpty() ? "" : ", preface = '" + preface + "'") +
                ", date formats = " + dateFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) +
                ", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" +
                (epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'");
        }
    }

    static final class CandidateTimestampFormat {

        final List<String> dateFormats;
        final Pattern simplePattern;
        final Grok strictSearchGrok;
        final Grok strictFullMatchGrok;
        final String standardGrokPatternName;
        final List<Integer> quickRuleOutIndices;

        CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
            this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName);
        }

        CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
                                 List<Integer> quickRuleOutIndices) {
            this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName, quickRuleOutIndices);
        }

        CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
            this(dateFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, Collections.emptyList());
        }

        CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
                                 List<Integer> quickRuleOutIndices) {
            this.dateFormats = dateFormats;
            this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE);
            // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
            this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern +
                "%{GREEDYDATA:" + EPILOGUE + "}");
            this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern);
            this.standardGrokPatternName = standardGrokPatternName;
            assert quickRuleOutIndices.stream()
                .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size());
            this.quickRuleOutIndices = quickRuleOutIndices;
        }
    }
}

@@ -0,0 +1,35 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.util.List;

public class TsvLogStructureFinderFactory implements LogStructureFinderFactory {

    /**
     * Rules are:
     * - The file must be valid TSV
     * - It must contain at least two complete records
     * - There must be at least two fields per record (otherwise files with no tabs could be treated as TSV!)
     * - Every TSV record except the last must have the same number of fields
     * The reason the last record is allowed to have fewer fields than the others is that
     * it could have been truncated when the file was sampled.
     */
    @Override
    public boolean canCreateFromSample(List<String> explanation, String sample) {
        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.TAB_PREFERENCE, "TSV");
    }

    @Override
    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
        throws IOException {
        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
            CsvPreference.TAB_PREFERENCE, false);
    }
}
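
A quick sketch of the TSV factory in use (the sample strings are invented for illustration):

List<String> explanation = new ArrayList<>();
TsvLogStructureFinderFactory factory = new TsvLogStructureFinderFactory();
factory.canCreateFromSample(explanation, "name\tvalue\nfoo\t1\nbar\t2\n");    // true
factory.canCreateFromSample(explanation, "no tabs on this line\nnor here\n"); // false - only one field per record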

@@ -0,0 +1,172 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;

public class XmlLogStructureFinder implements LogStructureFinder {

    private final List<String> sampleMessages;
    private final LogStructure structure;

    static XmlLogStructureFinder makeXmlLogStructureFinder(List<String> explanation, String sample, String charsetName,
                                                           Boolean hasByteOrderMarker)
        throws IOException, ParserConfigurationException, SAXException {

        String messagePrefix;
        try (Scanner scanner = new Scanner(sample)) {
            messagePrefix = scanner.next();
        }

        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
        docBuilderFactory.setNamespaceAware(false);
        docBuilderFactory.setValidating(false);

        List<String> sampleMessages = new ArrayList<>();
        List<Map<String, ?>> sampleRecords = new ArrayList<>();

        String[] sampleDocEnds = sample.split(Pattern.quote(messagePrefix));
        StringBuilder preamble = new StringBuilder(sampleDocEnds[0]);
        int linesConsumed = numNewlinesIn(sampleDocEnds[0]);
        for (int i = 1; i < sampleDocEnds.length; ++i) {
            String sampleDoc = messagePrefix + sampleDocEnds[i];
            if (i < 3) {
                preamble.append(sampleDoc);
            }
            DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
            try (InputStream is = new ByteArrayInputStream(sampleDoc.getBytes(StandardCharsets.UTF_8))) {
                sampleRecords.add(docToMap(docBuilder.parse(is)));
                sampleMessages.add(sampleDoc);
                linesConsumed += numNewlinesIn(sampleDoc);
            } catch (SAXException e) {
                // Tolerate an incomplete last record as long as we have one complete record
                if (sampleRecords.isEmpty() || i < sampleDocEnds.length - 1) {
                    throw e;
                }
            }
        }

        if (sample.endsWith("\n") == false) {
            ++linesConsumed;
        }

        // If we get here the XML parser should have confirmed this
        assert messagePrefix.charAt(0) == '<';
        String topLevelTag = messagePrefix.substring(1);

        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.XML)
            .setCharset(charsetName)
            .setHasByteOrderMarker(hasByteOrderMarker)
            .setSampleStart(preamble.toString())
            .setNumLinesAnalyzed(linesConsumed)
            .setNumMessagesAnalyzed(sampleRecords.size())
            .setMultilineStartPattern("^\\s*<" + topLevelTag);

        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
        if (timeField != null) {
            structureBuilder.setTimestampField(timeField.v1())
                .setTimestampFormats(timeField.v2().dateFormats)
                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
        }

        SortedMap<String, Object> innerMappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
        Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
        secondLevelProperties.put(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
        secondLevelProperties.put(LogStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
        SortedMap<String, Object> outerMappings = new TreeMap<>();
        outerMappings.put(topLevelTag, secondLevelProperties);
        outerMappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD,
            Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));

        LogStructure structure = structureBuilder
            .setMappings(outerMappings)
            .setExplanation(explanation)
            .build();

        return new XmlLogStructureFinder(sampleMessages, structure);
    }
|
||||
private XmlLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
|
||||
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
|
||||
this.structure = structure;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getSampleMessages() {
|
||||
return sampleMessages;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LogStructure getStructure() {
|
||||
return structure;
|
||||
}
|
||||
|
||||
private static int numNewlinesIn(String str) {
|
||||
return (int) str.chars().filter(c -> c == '\n').count();
|
||||
}
|
||||
|
||||
private static Map<String, Object> docToMap(Document doc) {
|
||||
|
||||
Map<String, Object> docAsMap = new LinkedHashMap<>();
|
||||
|
||||
doc.getDocumentElement().normalize();
|
||||
addNodeToMap(doc.getDocumentElement(), docAsMap);
|
||||
|
||||
return docAsMap;
|
||||
}
|
||||
|
||||
private static void addNodeToMap(Node node, Map<String, Object> nodeAsMap) {
|
||||
|
||||
NamedNodeMap attributes = node.getAttributes();
|
||||
for (int i = 0; i < attributes.getLength(); ++i) {
|
||||
Node attribute = attributes.item(i);
|
||||
nodeAsMap.put(attribute.getNodeName(), attribute.getNodeValue());
|
||||
}
|
||||
|
||||
NodeList children = node.getChildNodes();
|
||||
for (int i = 0; i < children.getLength(); ++i) {
|
||||
Node child = children.item(i);
|
||||
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
||||
if (child.getChildNodes().getLength() == 1) {
|
||||
Node grandChild = child.getChildNodes().item(0);
|
||||
String value = grandChild.getNodeValue().trim();
|
||||
if (value.isEmpty() == false) {
|
||||
nodeAsMap.put(child.getNodeName(), value);
|
||||
}
|
||||
} else {
|
||||
Map<String, Object> childNodeAsMap = new LinkedHashMap<>();
|
||||
addNodeToMap(child, childNodeAsMap);
|
||||
if (childNodeAsMap.isEmpty() == false) {
|
||||
nodeAsMap.put(child.getNodeName(), childNodeAsMap);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
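
A note on the conversion implemented by docToMap/addNodeToMap above, with an invented record for illustration: a document such as

    <event pid="42">
    <level>INFO</level>
    <message>starting up</message>
    </event>

is flattened into the insertion-ordered map { pid=42, level=INFO, message=starting up }. Attributes and elements whose only child is a non-blank text node become string entries, elements containing further elements recurse into nested maps, and empty values are dropped entirely.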

@@ -0,0 +1,122 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.Location;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;

public class XmlLogStructureFinderFactory implements LogStructureFinderFactory {

    private final XMLInputFactory xmlFactory;

    public XmlLogStructureFinderFactory() {
        xmlFactory = XMLInputFactory.newInstance();
        xmlFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
        xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
    }

    /**
     * This format matches if the sample consists of one or more XML documents,
     * all with the same root element name. If there is more than one document,
     * only whitespace is allowed in between them. The last one does not
     * necessarily have to be complete (as the sample could have truncated it).
     */
    @Override
    public boolean canCreateFromSample(List<String> explanation, String sample) {

        int completeDocCount = 0;
        String commonRootElementName = null;
        String remainder = sample.trim();
        boolean mightBeAnotherDocument = !remainder.isEmpty();

        // This processing is extremely complicated because it's necessary
        // to create a new XML stream reader per document, but each one
        // will read ahead so will potentially consume characters from the
        // following document. We must therefore also recreate the string
        // reader for each document.
        while (mightBeAnotherDocument) {

            try (Reader reader = new StringReader(remainder)) {

                XMLStreamReader xmlReader = xmlFactory.createXMLStreamReader(reader);
                try {
                    int nestingLevel = 0;
                    while ((mightBeAnotherDocument = xmlReader.hasNext())) {
                        switch (xmlReader.next()) {
                            case XMLStreamReader.START_ELEMENT:
                                if (nestingLevel++ == 0) {
                                    String rootElementName = xmlReader.getLocalName();
                                    if (commonRootElementName == null) {
                                        commonRootElementName = rootElementName;
                                    } else if (commonRootElementName.equals(rootElementName) == false) {
                                        explanation.add("Not XML because different documents have different root " +
                                            "element names: [" + commonRootElementName + "] and [" + rootElementName + "]");
                                        return false;
                                    }
                                }
                                break;
                            case XMLStreamReader.END_ELEMENT:
                                if (--nestingLevel < 0) {
                                    explanation.add("Not XML because an end element occurs before a start element");
                                    return false;
                                }
                                break;
                        }
                        if (nestingLevel == 0) {
                            ++completeDocCount;
                            // Find the position that's one character beyond end of the end element.
                            // The next document (if there is one) must start after this (possibly
                            // preceded by whitespace).
                            Location location = xmlReader.getLocation();
                            int endPos = 0;
                            // Line and column numbers start at 1, not 0
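                            // Worked example (added here for illustration, not in the original
                            // source): if the parser reports line 3, column 5, the loop below
                            // skips past two '\n' characters and the final adjustment adds four
                            // more, leaving endPos just beyond the end element.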
                            for (int wholeLines = location.getLineNumber() - 1; wholeLines > 0; --wholeLines) {
                                endPos = remainder.indexOf('\n', endPos) + 1;
                                if (endPos == 0) {
                                    explanation.add("Not XML because XML parser location is inconsistent: line [" +
                                        location.getLineNumber() + "], column [" + location.getColumnNumber() + "] in [" + remainder + "]");
                                    return false;
                                }
                            }
                            endPos += location.getColumnNumber() - 1;
                            remainder = remainder.substring(endPos).trim();
                            mightBeAnotherDocument = !remainder.isEmpty();
                            break;
                        }
                    }
                } finally {
                    xmlReader.close();
                }
            } catch (IOException | XMLStreamException e) {
                explanation.add("Not XML because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
                return false;
            }
        }

        if (completeDocCount == 0) {
            explanation.add("Not XML because sample didn't contain a complete document");
            return false;
        }

        explanation.add("Deciding sample is XML");
        return true;
    }

    @Override
    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
        throws IOException, ParserConfigurationException, SAXException {
        return XmlLogStructureFinder.makeXmlLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
    }
}
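
For orientation, a minimal sketch of how this two-step factory contract is driven; the sample string and charset below are invented for illustration, and the checked-exception handling is elided:

    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();
    String sample = "<event><level>INFO</level></event>\n<event><level>WARN</level></event>\n";
    if (factory.canCreateFromSample(explanation, sample)) {
        // the charset name and byte order marker flag would normally come from charset detection
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", false);
        LogStructure structure = finder.getStructure();
    }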

@@ -0,0 +1,38 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class CsvLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();

    // No need to check JSON or XML because they come earlier in the order we check formats

    public void testCanCreateFromSampleGivenCsv() {

        assertTrue(factory.canCreateFromSample(explanation, CSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenTsv() {

        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {

        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}
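
The "come earlier in the order we check formats" comments in these test classes reflect how the factories are driven: the manager tries each factory in a fixed sequence and the first one whose canCreateFromSample returns true wins. A rough sketch of that pattern, assuming an ordered factory list (the actual list lives in LogStructureFinderManager, which is not part of this excerpt):

    for (LogStructureFinderFactory candidate : orderedFactories) {
        if (candidate.canCreateFromSample(explanation, sample)) {
            return candidate.createFromSample(explanation, sample, charsetName, hasByteOrderMarker);
        }
    }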

@@ -0,0 +1,326 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.GrokPatternCreator.ValueOnlyGrokPatternCandidate;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import static org.hamcrest.Matchers.containsInAnyOrder;

public class GrokPatternCreatorTests extends LogStructureTestCase {

    public void testBuildFieldName() {
        Map<String, Integer> fieldNameCountStore = new HashMap<>();
        assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
        assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
        assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
        assertEquals("extra_timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
        assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
        assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri"));
        assertEquals("extra_timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
        assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
    }

    public void testPopulatePrefacesAndEpiloguesGivenTimestamp() {

        Collection<String> matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
            "[2018-01-24T12:33:23] ERROR ",
            "junk [2018-01-22T07:33:23] INFO ",
            "[2018-01-21T03:33:23] DEBUG ");
        ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp");

        Map<String, Integer> fieldNameCountStore = new HashMap<>();
        Collection<String> prefaces = new ArrayList<>();
        Collection<String> epilogues = new ArrayList<>();

        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);

        assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
        assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
    }

    public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() {

        Collection<String> matchingStrings = Arrays.asList("before alice@acme.com after",
            "abc bob@acme.com xyz",
            "carol@acme.com");
        ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email");

        Map<String, Integer> fieldNameCountStore = new HashMap<>();
        Collection<String> prefaces = new ArrayList<>();
        Collection<String> epilogues = new ArrayList<>();

        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);

        assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
        assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
    }

    public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {

        Collection<String> snippets = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
            "[2018-01-24T12:33:23] ERROR ",
            "junk [2018-01-22T07:33:23] INFO ",
            "[2018-01-21T03:33:23] DEBUG ");

        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);

        assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ",
            grokPatternCreator.getOverallGrokPatternBuilder().toString());
    }

    public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {

        Collection<String> snippets = Arrays.asList("(-2)",
            " (-3)",
            " (4)",
            " (-5) ");

        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);

        assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
    }

    public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() {

        Collection<String> snippets = Arrays.asList("before-2 ",
            "prior to-3",
            "-4");

        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);

        // It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers
assertEquals(".*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
|
||||
|
||||
Collection<String> snippets = Arrays.asList(" abc",
|
||||
" 123",
|
||||
" -123",
|
||||
"1f is hex");
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
|
||||
|
||||
assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
|
||||
|
||||
Collection<String> snippets = Arrays.asList("<host1.1.p2ps:",
|
||||
"<host2.1.p2ps:");
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
|
||||
|
||||
// We don't want the .1. in the middle to get detected as a hex number
|
||||
assertEquals("<.*?:", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
|
||||
|
||||
Collection<String> snippets = Arrays.asList("before alice@acme.com after",
|
||||
"abc bob@acme.com xyz",
|
||||
"carol@acme.com");
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
|
||||
|
||||
assertEquals(".*?%{EMAILADDRESS:email}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testAppendBestGrokMatchForStringsGivenUris() {
|
||||
|
||||
Collection<String> snippets = Arrays.asList("main site https://www.elastic.co/ with trailing slash",
|
||||
"https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
|
||||
"download today from https://www.elastic.co/downloads");
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
|
||||
|
||||
assertEquals(".*?%{URI:uri}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testAppendBestGrokMatchForStringsGivenPaths() {
|
||||
|
||||
Collection<String> snippets = Arrays.asList("on Mac /Users/dave",
|
||||
"on Windows C:\\Users\\dave",
|
||||
"on Linux /home/dave");
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
|
||||
|
||||
assertEquals(".*? .*? %{PATH:path}", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testAppendBestGrokMatchForStringsGivenKvPairs() {
|
||||
|
||||
Collection<String> snippets = Arrays.asList("foo=1 and bar=a",
|
||||
"something foo=2 bar=b something else",
|
||||
"foo=3 bar=c",
|
||||
" foo=1 bar=a ");
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
|
||||
|
||||
assertEquals(".*?\\bfoo=%{USER:foo} .*?\\bbar=%{USER:bar}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
}
|
||||
|
||||
public void testCreateGrokPatternFromExamplesGivenNamedLogs() {
|
||||
|
||||
Collection<String> sampleMessages = Arrays.asList(
|
||||
"Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
|
||||
"Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
|
||||
"Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
|
||||
"Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
|
||||
|
||||
Map<String, Object> mappings = new HashMap<>();
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
|
||||
|
||||
assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " +
|
||||
"%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}",
|
||||
grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", "timestamp"));
|
||||
assertEquals(5, mappings.size());
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("field2"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field3"));
|
||||
}
|
||||
|
||||
public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() {
|
||||
|
||||
Collection<String> sampleMessages = Arrays.asList(
|
||||
"Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
|
||||
"Invalid chunk ignored.",
|
||||
"Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
|
||||
"Invalid chunk ignored.",
|
||||
"Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
|
||||
"Invalid chunk ignored.",
|
||||
"Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
|
||||
"Invalid chunk ignored.");
|
||||
|
||||
Map<String, Object> mappings = new HashMap<>();
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
|
||||
|
||||
assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*",
|
||||
grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp"));
|
||||
assertEquals(1, mappings.size());
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
|
||||
}
|
||||
|
||||
public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() {
|
||||
|
||||
// Two timestamps: one local, one UTC
|
||||
Collection<String> sampleMessages = Arrays.asList(
|
||||
"559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
|
||||
"Info\tsshd\tsubsystem request for sftp",
|
||||
"559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
|
||||
"Info\tsshd\tsubsystem request for sftp",
|
||||
"559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
|
||||
"Info\tsshd\tsubsystem request for sftp",
|
||||
"559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
|
||||
"Info\tsshd\tsubsystem request for sftp");
|
||||
|
||||
Map<String, Object> mappings = new HashMap<>();
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
|
||||
|
||||
assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" +
|
||||
"%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*",
|
||||
grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp"));
|
||||
assertEquals(5, mappings.size());
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"),
|
||||
mappings.get("extra_timestamp"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
|
||||
}
|
||||
|
||||
public void testFindFullLineGrokPatternGivenApacheCombinedLogs() {
|
||||
Collection<String> sampleMessages = Arrays.asList(
|
||||
"83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
|
||||
"\"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1\" 200 203023 " +
|
||||
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
|
||||
"83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
|
||||
"\"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1\" 200 7697 " +
|
||||
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
|
||||
"83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
|
||||
"\"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1\" 200 26185 " +
|
||||
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
|
||||
"83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
|
||||
"\"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1\" 200 430406 " +
|
||||
"\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"");
|
||||
|
||||
Map<String, Object> mappings = new HashMap<>();
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
|
||||
|
||||
assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern());
|
||||
assertEquals(10, mappings.size());
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bytes"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("clientip"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double"), mappings.get("httpversion"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("ident"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("referrer"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("request"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("response"));
|
||||
assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("verb"));
|
||||
}
|
||||
|
||||
public void testAdjustForPunctuationGivenCommonPrefix() {
|
||||
Collection<String> snippets = Arrays.asList(
|
||||
"\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.212\",\"No-lookup\",\"192.168.33.132\"," +
|
||||
"\"80\",\"46721\",\"/Common/Subnet_33\",\"TCP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
|
||||
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
|
||||
"\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.143.244\",\"No-lookup\",\"192.168.33.106\"," +
|
||||
"\"55025\",\"162\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
|
||||
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
|
||||
"\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.3\",\"No-lookup\",\"224.0.0.102\"," +
|
||||
"\"3222\",\"3222\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
|
||||
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\""
|
||||
);
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
|
||||
|
||||
assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
assertNotNull(adjustedSnippets);
|
||||
assertThat(new ArrayList<>(adjustedSnippets),
|
||||
containsInAnyOrder(snippets.stream().map(snippet -> snippet.substring(2)).toArray(String[]::new)));
|
||||
}
|
||||
|
||||
public void testAdjustForPunctuationGivenNoCommonPrefix() {
|
||||
Collection<String> snippets = Arrays.asList(
|
||||
"|client (id:2) was removed from servergroup 'Normal'(id:7) by client 'User1'(id:2)",
|
||||
"|servergroup 'GAME'(id:9) was added by 'User1'(id:2)",
|
||||
"|permission 'i_group_auto_update_type'(id:146) with values (value:30, negated:0, skipchannel:0) " +
|
||||
"was added by 'User1'(id:2) to servergroup 'GAME'(id:9)"
|
||||
);
|
||||
|
||||
GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
|
||||
Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
|
||||
|
||||
assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());
|
||||
assertSame(snippets, adjustedSnippets);
|
||||
}
|
||||
}

@@ -0,0 +1,46 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class JsonLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();

    public void testCanCreateFromSampleGivenJson() {

        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
    }

    public void testCanCreateFromSampleGivenXml() {

        assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE));
    }

    public void testCanCreateFromSampleGivenCsv() {

        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenTsv() {

        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {

        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}

@@ -0,0 +1,39 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.Collections;

public class JsonLogStructureFinderTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();

    public void testCreateConfigsGivenGoodJson() throws Exception {
        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.JSON, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertNull(structure.getExcludeLinesPattern());
        assertNull(structure.getMultilineStartPattern());
        assertNull(structure.getSeparator());
        assertNull(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertNull(structure.getGrokPattern());
        assertEquals("timestamp", structure.getTimestampField());
        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
    }
}

@@ -0,0 +1,72 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import com.ibm.icu.text.CharsetMatch;

import java.io.ByteArrayInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import static org.hamcrest.Matchers.startsWith;
import static org.hamcrest.core.IsInstanceOf.instanceOf;

public class LogStructureFinderManagerTests extends LogStructureTestCase {

    private LogStructureFinderManager structureFinderManager = new LogStructureFinderManager();

    public void testFindCharsetGivenCharacterWidths() throws Exception {

        for (Charset charset : Arrays.asList(StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE)) {
            CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation,
                new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)));
            assertEquals(charset.name(), charsetMatch.getName());
        }
    }

    public void testFindCharsetGivenBinary() throws Exception {

        // This input should never match a single byte character set. ICU4J will sometimes decide
        // that it matches a double byte character set, hence the two assertion branches.
        int size = 1000;
        byte[] binaryBytes = randomByteArrayOfLength(size);
        for (int i = 0; i < 10; ++i) {
            binaryBytes[randomIntBetween(0, size - 1)] = 0;
        }

        try {
            CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes));
            assertThat(charsetMatch.getName(), startsWith("UTF-16"));
        } catch (IllegalArgumentException e) {
            assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage());
        }
    }

    public void testMakeBestStructureGivenJson() throws Exception {
        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
            "{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()),
            instanceOf(JsonLogStructureFinder.class));
    }

    public void testMakeBestStructureGivenXml() throws Exception {
        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
            "<log time=\"2018-05-17T13:41:23\"><message>hello</message></log>", StandardCharsets.UTF_8.name(), randomBoolean()),
            instanceOf(XmlLogStructureFinder.class));
    }

    public void testMakeBestStructureGivenCsv() throws Exception {
        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" +
            "2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()),
            instanceOf(SeparatedValuesLogStructureFinder.class));
    }

    public void testMakeBestStructureGivenText() throws Exception {
        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" +
            "[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()),
            instanceOf(TextLogStructureFinder.class));
    }
}

@@ -0,0 +1,86 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.test.ESTestCase;
import org.junit.After;
import org.junit.Before;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

public abstract class LogStructureTestCase extends ESTestCase {

    protected static final List<String> POSSIBLE_CHARSETS = Collections.unmodifiableList(Charset.availableCharsets().keySet().stream()
        .filter(name -> LogStructureFinderManager.FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT)))
        .collect(Collectors.toList()));

    protected static final String CSV_SAMPLE = "time,id,value\n" +
        "2018-05-17T16:23:40,key1,42.0\n" +
        "2018-05-17T16:24:11,\"key with spaces\",42.0\n";

    protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
        "\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
        "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
        "{\"logger\":\"controller\",\"timestamp\":1478261151445," +
        "\"level\":\"INFO\",\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 2\",\"class\":\"ml\"," +
        "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n";

    protected static final String PIPE_SEPARATED_VALUES_SAMPLE = "2018-01-06 16:56:14.295748|INFO |VirtualServer |1 |" +
        "listening on 0.0.0.0:9987, :::9987\n" +
        "2018-01-06 17:19:44.465252|INFO |VirtualServer |1 |client " +
        "'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)\n" +
        "2018-01-06 17:21:25.764368|INFO |VirtualServer |1 |client " +
        "'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel 'Default Channel'(id:1)";

    protected static final String SEMI_COLON_SEPARATED_VALUES_SAMPLE = "\"pos_id\";\"trip_id\";\"latitude\";\"longitude\";\"altitude\";" +
        "\"timestamp\"\n" +
        "\"1\";\"3\";\"4703.7815\";\"1527.4713\";\"359.9\";\"2017-01-19 16:19:04.742113\"\n" +
        "\"2\";\"3\";\"4703.7815\";\"1527.4714\";\"359.9\";\"2017-01-19 16:19:05.741890\"\n" +
        "\"3\";\"3\";\"4703.7816\";\"1527.4716\";\"360.3\";\"2017-01-19 16:19:06.738842\"";

    protected static final String TEXT_SAMPLE = "[2018-05-11T17:07:29,461][INFO ][o.e.n.Node ] [node-0] initializing ...\n" +
        "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
        "net usable_space [223.4gb], net total_space [464.7gb], types [hfs]\n" +
        "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], " +
        "compressed ordinary object pointers [true]\n" +
        "[2018-05-11T17:07:29,556][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [tJ9u8HcaTbWxRtnlfz1RQA]\n";

    protected static final String TSV_SAMPLE = "time\tid\tvalue\n" +
        "2018-05-17T16:23:40\tkey1\t42.0\n" +
        "2018-05-17T16:24:11\t\"key with spaces\"\t42.0\n";

    protected static final String XML_SAMPLE = "<log4j:event logger=\"autodetect\" timestamp=\"1526574809521\" level=\"ERROR\" " +
        "thread=\"0x7fffc5a7c3c0\">\n" +
        "<log4j:message><![CDATA[Neither a fieldname clause nor a field config file was specified]]></log4j:message>\n" +
        "</log4j:event>\n" +
        "\n" +
        "<log4j:event logger=\"autodetect\" timestamp=\"1526574809522\" level=\"FATAL\" thread=\"0x7fffc5a7c3c0\">\n" +
        "<log4j:message><![CDATA[Field config could not be interpreted]]></log4j:message>\n" +
        "</log4j:event>\n" +
        "\n";

    protected List<String> explanation;

    @Before
    public void initExplanation() {
        explanation = new ArrayList<>();
    }

    @After
    public void printExplanation() {
        Loggers.getLogger(getClass()).info("Explanation:\n" + String.join("\n", explanation));
    }

    protected Boolean randomHasByteOrderMarker(String charset) {
        return charset.toUpperCase(Locale.ROOT).startsWith("UTF") ? randomBoolean() : null;
    }
}

@@ -0,0 +1,83 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;

import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

public class LogStructureTests extends AbstractXContentTestCase<LogStructure> {

    protected LogStructure createTestInstance() {

        LogStructure.Format format = randomFrom(EnumSet.allOf(LogStructure.Format.class));

        LogStructure.Builder builder = new LogStructure.Builder(format);

        int numLinesAnalyzed = randomIntBetween(2, 10000);
        builder.setNumLinesAnalyzed(numLinesAnalyzed);
        int numMessagesAnalyzed = randomIntBetween(1, numLinesAnalyzed);
        builder.setNumMessagesAnalyzed(numMessagesAnalyzed);
        builder.setSampleStart(randomAlphaOfLength(1000));

        String charset = randomFrom(Charset.availableCharsets().keySet());
        builder.setCharset(charset);
        if (charset.toUpperCase(Locale.ROOT).startsWith("UTF")) {
            builder.setHasByteOrderMarker(randomBoolean());
        }

        if (numMessagesAnalyzed < numLinesAnalyzed) {
            builder.setMultilineStartPattern(randomAlphaOfLength(100));
        }
        if (randomBoolean()) {
            builder.setExcludeLinesPattern(randomAlphaOfLength(100));
        }

        if (format.isSeparatedValues() || (format.supportsNesting() && randomBoolean())) {
            builder.setInputFields(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
        }
        if (format.isSeparatedValues()) {
            builder.setHasHeaderRow(randomBoolean());
            if (rarely()) {
                builder.setSeparator(format.separator());
            }
        }
        if (format.isSemiStructured()) {
            builder.setGrokPattern(randomAlphaOfLength(100));
        }

        if (format.isSemiStructured() || randomBoolean()) {
            builder.setTimestampField(randomAlphaOfLength(10));
            builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
            builder.setNeedClientTimezone(randomBoolean());
        }

        Map<String, Object> mappings = new TreeMap<>();
        for (String field : generateRandomStringArray(5, 20, false, false)) {
            mappings.put(field, Collections.singletonMap(randomAlphaOfLength(5), randomAlphaOfLength(10)));
        }
        builder.setMappings(mappings);

        builder.setExplanation(Arrays.asList(generateRandomStringArray(10, 150, false, false)));

        return builder.build();
    }

    protected LogStructure doParseInstance(XContentParser parser) {
        return LogStructure.PARSER.apply(parser, null).build();
    }

    protected boolean supportsUnknownFields() {
        return false;
    }
}

@@ -0,0 +1,292 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import static org.hamcrest.Matchers.contains;

public class LogStructureUtilsTests extends LogStructureTestCase {

    public void testMoreLikelyGivenText() {
        assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword("the quick brown fox jumped over the lazy dog"));
        assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(257, 10000)));
    }

    public void testMoreLikelyGivenKeyword() {
        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("1"));
        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("DEBUG"));
        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256)));
    }

    public void testSingleSampleSingleField() {
        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
        assertNotNull(match);
        assertEquals("field1", match.v1());
        assertThat(match.v2().dateFormats, contains("ISO8601"));
        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
    }

    public void testSamplesWithSameSingleTimeField() {
        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
        Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406");
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNotNull(match);
        assertEquals("field1", match.v1());
        assertThat(match.v2().dateFormats, contains("ISO8601"));
        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
    }

    public void testSamplesWithOneSingleTimeFieldDifferentFormat() {
        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
        Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406");
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNull(match);
    }

    public void testSamplesWithDifferentSingleTimeField() {
        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
        Map<String, String> sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406");
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNull(match);
    }

    public void testSingleSampleManyFieldsOneTimeFormat() {
        Map<String, Object> sample = new LinkedHashMap<>();
        sample.put("foo", "not a time");
        sample.put("time", "2018-05-24 17:28:31,735");
        sample.put("bar", 42);
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
        assertNotNull(match);
        assertEquals("time", match.v1());
        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
    }

    public void testSamplesWithManyFieldsSameSingleTimeFormat() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("foo", "not a time");
        sample1.put("time", "2018-05-24 17:28:31,735");
        sample1.put("bar", 42);
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("foo", "whatever");
        sample2.put("time", "2018-05-29 11:53:02,837");
        sample2.put("bar", 17);
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNotNull(match);
        assertEquals("time", match.v1());
        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
    }

    public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("foo", "not a time");
        sample1.put("time", "2018-05-24 17:28:31,735");
        sample1.put("bar", 42);
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("foo", "whatever");
        sample2.put("time", "May 29 2018 11:53:02");
        sample2.put("bar", 17);
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNull(match);
    }

    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("red_herring", "May 29 2007 11:53:02");
        sample1.put("time", "2018-05-24 17:28:31,735");
        sample1.put("bar", 42);
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("red_herring", "whatever");
        sample2.put("time", "2018-05-29 11:53:02,837");
        sample2.put("bar", 17);
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNotNull(match);
        assertEquals("time", match.v1());
        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
    }

    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("foo", "not a time");
        sample1.put("time", "May 24 2018 17:28:31");
        sample1.put("red_herring", "2018-05-24 17:28:31,735");
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("foo", "whatever");
        sample2.put("time", "May 29 2018 11:53:02");
        sample2.put("red_herring", "17");
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNotNull(match);
        assertEquals("time", match.v1());
        assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"));
        assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
    }

    public void testSamplesWithManyFieldsInconsistentTimeFields() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("foo", "not a time");
        sample1.put("time1", "May 24 2018 17:28:31");
        sample1.put("bar", 17);
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("foo", "whatever");
        sample2.put("time2", "May 29 2018 11:53:02");
        sample2.put("bar", 42);
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNull(match);
    }

    public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("foo", "not a time");
        sample1.put("time1", "2018-05-09 17:28:31,735");
        sample1.put("time2", "May 9 2018 17:28:31");
        sample1.put("bar", 17);
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("foo", "whatever");
        sample2.put("time2", "May 10 2018 11:53:02");
        sample2.put("time3", "Thu, May 10 2018 11:53:02");
        sample2.put("bar", 42);
        Tuple<String, TimestampMatch> match =
            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
        assertNotNull(match);
        assertEquals("time2", match.v1());
        assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"));
        assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
    }

    public void testGuessMappingGivenNothing() {
        assertNull(LogStructureUtils.guessMapping(explanation, "foo", Collections.emptyList()));
    }

    public void testGuessMappingGivenKeyword() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");

        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
    }

    public void testGuessMappingGivenText() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text");

        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
            Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
    }

    public void testGuessMappingGivenIp() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip");

        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
    }

    public void testGuessMappingGivenDouble() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double");

        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
        // 12345678901234567890 is too long for long
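        // (Long.MAX_VALUE is 9223372036854775807, i.e. 19 digits, so this 20-digit value can only be mapped as double)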
|
||||
assertEquals(expected,
|
||||
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("1", "2", "12345678901234567890")));
|
||||
assertEquals(expected,
|
||||
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(3.14159265359, 0.0, 1e-308)));
|
||||
assertEquals(expected,
|
||||
LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("-1e-1", "-1e308", "1e-308")));
|
||||
}
|
||||
|
||||
public void testGuessMappingGivenLong() {
|
||||
Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
|
||||
|
        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
    }

    public void testGuessMappingGivenDate() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date");

        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
            Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
    }

    public void testGuessMappingGivenBoolean() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "boolean");

        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("false", "true")));
        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(true, false)));
    }

    public void testGuessMappingGivenArray() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");

        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));

        expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");

        assertEquals(expected,
            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
    }

    public void testGuessMappingGivenObject() {
        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "object");

        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
            Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
    }

    public void testGuessMappingGivenObjectAndNonObject() {
        RuntimeException e = expectThrows(RuntimeException.class, () -> LogStructureUtils.guessMapping(explanation,
            "foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2")));

        assertEquals("Field [foo] has both object and non-object values - this is not supported by Elasticsearch", e.getMessage());
    }

    public void testGuessMappings() {
        Map<String, Object> sample1 = new LinkedHashMap<>();
        sample1.put("foo", "not a time");
        sample1.put("time", "2018-05-24 17:28:31,735");
        sample1.put("bar", 42);
        sample1.put("nothing", null);
        Map<String, Object> sample2 = new LinkedHashMap<>();
        sample2.put("foo", "whatever");
        sample2.put("time", "2018-05-29 11:53:02,837");
        sample2.put("bar", 17);
        sample2.put("nothing", null);

        Map<String, Object> mappings = LogStructureUtils.guessMappings(explanation, Arrays.asList(sample1, sample2));
        assertNotNull(mappings);
        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
        Map<String, String> expectedTimeMapping = new HashMap<>();
        expectedTimeMapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
        expectedTimeMapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, "YYYY-MM-dd HH:mm:ss,SSS");
        assertEquals(expectedTimeMapping, mappings.get("time"));
        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
        assertNull(mappings.get("nothing"));
    }
}
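// The assertions above pin down a precedence among the candidate mapping types: object beats
// everything (and mixing object with non-object values throws), boolean, date and long are tried
// before falling back to keyword, and arrays contribute their elements rather than a type of
// their own. A minimal sketch of that decision order - not the actual LogStructureUtils
// implementation - assuming MAPPING_TYPE_SETTING holds the literal "type" and reusing
// TimestampFormatFinder.findFirstFullMatch for the date check (requires java.util.* and
// java.util.stream.Collectors):
static Map<String, String> guessMappingSketch(List<Object> values) {
    List<Object> flat = new ArrayList<>();
    for (Object value : values) {
        // arrays and lists are flattened into their elements
        if (value instanceof List) {
            flat.addAll((List<?>) value);
        } else if (value instanceof Object[]) {
            flat.addAll(Arrays.asList((Object[]) value));
        } else {
            flat.add(value);
        }
    }
    if (flat.stream().anyMatch(value -> value instanceof Map)) {
        if (flat.stream().allMatch(value -> value instanceof Map)) {
            return Collections.singletonMap("type", "object");
        }
        // mirrors the error asserted in testGuessMappingGivenObjectAndNonObject
        throw new RuntimeException("Field has both object and non-object values - this is not supported by Elasticsearch");
    }
    List<String> strings = flat.stream().map(Object::toString).collect(Collectors.toList());
    if (strings.stream().allMatch(s -> s.equals("true") || s.equals("false"))) {
        return Collections.singletonMap("type", "boolean");
    }
    if (strings.stream().allMatch(s -> TimestampFormatFinder.findFirstFullMatch(s) != null)) {
        return Collections.singletonMap("type", "date");
    }
    if (strings.stream().allMatch(s -> s.matches("-?\\d+"))) {
        return Collections.singletonMap("type", "long");
    }
    return Collections.singletonMap("type", "keyword");
}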
@ -0,0 +1,23 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class PipeSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new PipeSeparatedValuesLogStructureFinderFactory();

    // No need to check JSON, XML, CSV, TSV or semi-colon separated values because they come earlier in the order we check formats

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {

        assertTrue(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {

        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}
@ -0,0 +1,28 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class SemiColonSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new SemiColonSeparatedValuesLogStructureFinderFactory();

    // No need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {

        assertTrue(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {

        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}
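// The "come earlier in the order we check formats" comments in these factory tests imply one
// fixed, most-specific-first candidate ordering. A sketch of that ordering - the
// JsonLogStructureFinderFactory class name is an assumption here, as its file is outside this
// excerpt; the other factory names all appear in these tests:
List<LogStructureFinderFactory> orderedFactoriesSketch = Arrays.asList(
    new JsonLogStructureFinderFactory(), // assumed class name
    new XmlLogStructureFinderFactory(),
    new CsvLogStructureFinderFactory(),
    new TsvLogStructureFinderFactory(),
    new SemiColonSeparatedValuesLogStructureFinderFactory(),
    new PipeSeparatedValuesLogStructureFinderFactory(),
    new TextLogStructureFinderFactory());
// Each factory's tests therefore only need to reject sample formats that come after it in the
// list; anything earlier would already have been claimed by a previous factory.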
@ -0,0 +1,293 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinFieldwiseCompareRows;
import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinDistance;
import static org.hamcrest.Matchers.arrayContaining;

public class SeparatedValuesLogStructureFinderTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();

    public void testCreateConfigsGivenCompleteCsv() throws Exception {
        String sample = "time,message\n" +
            "2018-05-17T13:41:23,hello\n" +
            "2018-05-17T13:41:32,hello again\n";
        assertTrue(factory.canCreateFromSample(explanation, sample));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.CSV, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
        assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
        assertEquals(Character.valueOf(','), structure.getSeparator());
        assertTrue(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertEquals(Arrays.asList("time", "message"), structure.getInputFields());
        assertNull(structure.getGrokPattern());
        assertEquals("time", structure.getTimestampField());
        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
    }

    public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
        String sample = "message,time,count\n" +
            "\"hello\n" +
            "world\",2018-05-17T13:41:23,1\n" +
            "\"hello again\n"; // note that this last record is truncated
        assertTrue(factory.canCreateFromSample(explanation, sample));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.CSV, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
        assertEquals(Character.valueOf(','), structure.getSeparator());
        assertTrue(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertEquals(Arrays.asList("message", "time", "count"), structure.getInputFields());
        assertNull(structure.getGrokPattern());
        assertEquals("time", structure.getTimestampField());
        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
    }

    public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception {
        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
            "improvement_surcharge,total_amount,,\n" +
            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
        assertTrue(factory.canCreateFromSample(explanation, sample));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.CSV, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
            "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
            "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
            structure.getExcludeLinesPattern());
        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
        assertEquals(Character.valueOf(','), structure.getSeparator());
        assertTrue(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getInputFields());
        assertNull(structure.getGrokPattern());
        assertEquals("tpep_pickup_datetime", structure.getTimestampField());
        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
    }

    public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception {
        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
            "improvement_surcharge,total_amount\n" +
            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
        assertTrue(factory.canCreateFromSample(explanation, sample));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.CSV, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
            "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
            "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
            structure.getExcludeLinesPattern());
        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
        assertEquals(Character.valueOf(','), structure.getSeparator());
        assertTrue(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getInputFields());
        assertNull(structure.getGrokPattern());
        assertEquals("tpep_pickup_datetime", structure.getTimestampField());
        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
    }

    public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
        String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
            "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
            "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n";
        assertTrue(factory.canCreateFromSample(explanation, sample));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.CSV, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertEquals("^\"?pos_id\"?,\"?trip_id\"?,\"?latitude\"?,\"?longitude\"?,\"?altitude\"?,\"?timestamp\"?",
            structure.getExcludeLinesPattern());
        assertNull(structure.getMultilineStartPattern());
        assertEquals(Character.valueOf(','), structure.getSeparator());
        assertTrue(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getInputFields());
        assertNull(structure.getGrokPattern());
        assertEquals("timestamp", structure.getTimestampField());
        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats());
    }

    public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException {
        String withHeader = "time,airline,responsetime,sourcetype\n" +
            "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
            "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
            "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
            "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";

        Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
            SeparatedValuesLogStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1());

        assertTrue(header.v1());
        assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype"));
    }

    public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException {
        String withoutHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
            "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
            "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
            "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";

        Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
            SeparatedValuesLogStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1());

        assertFalse(header.v1());
        assertThat(header.v2(), arrayContaining("column1", "column2", "column3", "column4"));
    }

    public void testLevenshteinDistance() {

        assertEquals(0, levenshteinDistance("cat", "cat"));
        assertEquals(3, levenshteinDistance("cat", "dog"));
        assertEquals(5, levenshteinDistance("cat", "mouse"));
        assertEquals(3, levenshteinDistance("cat", ""));

        assertEquals(3, levenshteinDistance("dog", "cat"));
        assertEquals(0, levenshteinDistance("dog", "dog"));
        assertEquals(4, levenshteinDistance("dog", "mouse"));
        assertEquals(3, levenshteinDistance("dog", ""));

        assertEquals(5, levenshteinDistance("mouse", "cat"));
        assertEquals(4, levenshteinDistance("mouse", "dog"));
        assertEquals(0, levenshteinDistance("mouse", "mouse"));
        assertEquals(5, levenshteinDistance("mouse", ""));

        assertEquals(3, levenshteinDistance("", "cat"));
        assertEquals(3, levenshteinDistance("", "dog"));
        assertEquals(5, levenshteinDistance("", "mouse"));
        assertEquals(0, levenshteinDistance("", ""));
    }

    public void testLevenshteinCompareRows() {

        assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog")));
        assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat")));
        assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat")));
        assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat")));
        assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat")));
        assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse")));
        assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog")));
    }

    public void testLineHasUnescapedQuote() {

        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b\",c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b,c\"", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\",c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\"\"\",c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"\"\"", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\",b,c", CsvPreference.EXCEL_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,\"\"b\",c", CsvPreference.EXCEL_PREFERENCE));
        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words,b,c", CsvPreference.EXCEL_PREFERENCE));
        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\",b,c", CsvPreference.EXCEL_PREFERENCE));

        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\"\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\tc\"", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\"\"\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"\"\"", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\"\tb\tc", CsvPreference.TAB_PREFERENCE));
        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\t\"\"b\"\tc", CsvPreference.TAB_PREFERENCE));
        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words\tb\tc", CsvPreference.TAB_PREFERENCE));
        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\"\tb\tc", CsvPreference.TAB_PREFERENCE));
    }

    public void testRowContainsDuplicateNonEmptyValues() {

        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a")));
        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("")));
        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
        assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
        assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
    }
}
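// The distances asserted in testLevenshteinDistance are the classic Levenshtein edit distance
// (insertions, deletions and substitutions each costing 1), which the standard two-row
// dynamic-programming recurrence reproduces. A minimal sketch, assuming that is what
// levenshteinDistance implements; the fieldwise row comparison is deliberately not sketched,
// because its weighting is not evident from the tests alone:
static int levenshteinDistanceSketch(String first, String second) {
    int[] previousRow = new int[second.length() + 1];
    int[] currentRow = new int[second.length() + 1];
    for (int j = 0; j <= second.length(); ++j) {
        previousRow[j] = j; // empty prefix of first -> j insertions
    }
    for (int i = 1; i <= first.length(); ++i) {
        currentRow[0] = i; // empty prefix of second -> i deletions
        for (int j = 1; j <= second.length(); ++j) {
            int substitutionCost = (first.charAt(i - 1) == second.charAt(j - 1)) ? 0 : 1;
            currentRow[j] = Math.min(Math.min(currentRow[j - 1] + 1, previousRow[j] + 1),
                previousRow[j - 1] + substitutionCost);
        }
        int[] tmp = previousRow;
        previousRow = currentRow;
        currentRow = tmp;
    }
    return previousRow[second.length()];
}
// e.g. levenshteinDistanceSketch("cat", "mouse") == 5 and levenshteinDistanceSketch("dog", "") == 3,
// matching the assertions above.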
@ -0,0 +1,19 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class TextLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();

    // No need to check JSON, XML, CSV, TSV, semi-colon separated values or pipe
    // separated values because they come earlier in the order we check formats

    public void testCanCreateFromSampleGivenText() {

        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}
@ -0,0 +1,245 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;

import java.util.Collections;
import java.util.Set;

public class TextLogStructureFinderTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();

    public void testCreateConfigsGivenElasticsearchLog() throws Exception {
        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertNull(structure.getExcludeLinesPattern());
        assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
        assertNull(structure.getSeparator());
        assertNull(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
        assertEquals("timestamp", structure.getTimestampField());
        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
    }

    public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() {
        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
            assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.emptySet(), simpleDateRegex));
        }
    }

    public void testCreateMultiLineMessageStartRegexGivenOneEmptyPreface() {
        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
            assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton(""), simpleDateRegex));
        }
    }

    public void testCreateMultiLineMessageStartRegexGivenOneLogLevelPreface() {
        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
            assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton("[ERROR] ["), simpleDateRegex));
        }
    }

    public void testCreateMultiLineMessageStartRegexGivenManyLogLevelPrefaces() {
        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
            Set<String> prefaces = Sets.newHashSet("[ERROR] [", "[DEBUG] [");
            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
            assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
        }
    }

    public void testCreateMultiLineMessageStartRegexGivenManyHostnamePrefaces() {
        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
            Set<String> prefaces = Sets.newHashSet("host-1.acme.com|", "my_host.elastic.co|");
            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
            assertEquals("^.*?\\|" + simpleDateRegex,
                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
        }
    }

    public void testCreateMultiLineMessageStartRegexGivenManyPrefacesIncludingEmpty() {
        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
            Set<String> prefaces = Sets.newHashSet("", "[non-standard] ");
            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
            assertEquals("^.*?" + simpleDateRegex,
                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
        }
    }

    public void testMostLikelyTimestampGivenAllSame() {
        String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node ] [node-0] initializing ...\n" +
            "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
            "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" +
            "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], " +
            "compressed ordinary object pointers [true]\n" +
            "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" +
            "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " +
            "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " +
            "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" +
            "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " +
            "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " +
            "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " +
            "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " +
            "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " +
            "-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " +
            "-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " +
            "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " +
            "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " +
            "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " +
            "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " +
            "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" +
            "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " +
            "Elasticsearch and is not suitable for production\n" +
            "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService ] [node-0] loaded module [aggs-matrix-stats]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [analysis-common]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [ingest-common]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-expression]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-mustache]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-painless]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [mapper-extras]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [parent-join]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [percolator]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [rank-eval]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [reindex]\n" +
            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [repository-url]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [transport-netty4]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-core]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-deprecation]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-graph]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-logstash]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-ml]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-monitoring]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-rollup]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-security]\n" +
            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-sql]\n" +
            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-upgrade]\n" +
            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" +
            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n";

        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
        assertNotNull(mostLikelyMatch);
        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
            mostLikelyMatch.v1());
    }

    public void testMostLikelyTimestampGivenExceptionTrace() {
        String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " +
            "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" +
            "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " +
            "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " +
            "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " +
            "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " +
            "in length; got 49023\n" +
            "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " +
            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " +
            "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " +
            "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " +
            "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
            "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
            "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
            "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
            "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
            "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
            "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" +
            "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" +
            "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
            ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
            ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
            "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" +
            "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" +
            "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n";

        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
        assertNotNull(mostLikelyMatch);

        // Even though many lines have a timestamp near the end (in the Lucene version information),
        // these are so far along the lines that the weight of the timestamp near the beginning of the
        // first line should take precedence
        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
            mostLikelyMatch.v1());
    }
}
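// All five createMultiLineMessageStartRegex tests are consistent with one simple scheme: if
// every preface is empty the timestamp itself anchors the line (and the date regex's leading \b
// is redundant after ^); otherwise keep only separator characters ("[", "]", "|" and space
// here) that form the same skeleton in every preface, escape them, and collapse each run of
// other characters into a reluctant ".*?". A sketch under those assumptions - the actual
// heuristic may well differ:
static String createMultiLineMessageStartRegexSketch(Set<String> prefaces, String simpleDateRegex) {
    if (prefaces.stream().allMatch(String::isEmpty)) {
        return "^" + simpleDateRegex.replaceFirst("^\\\\b", "");
    }
    final String separators = "[]| ";
    String commonSkeleton = null;
    for (String preface : prefaces) {
        StringBuilder skeleton = new StringBuilder();
        for (char c : preface.toCharArray()) {
            if (separators.indexOf(c) >= 0) {
                skeleton.append(c);
            }
        }
        if (commonSkeleton == null) {
            commonSkeleton = skeleton.toString();
        } else if (commonSkeleton.equals(skeleton.toString()) == false) {
            return "^.*?" + simpleDateRegex; // the prefaces share no usable structure
        }
    }
    StringBuilder pattern = new StringBuilder("^");
    boolean inWildcard = false;
    for (char c : prefaces.iterator().next().toCharArray()) {
        if (separators.indexOf(c) >= 0) {
            if (c != ' ') {
                pattern.append('\\'); // escape the regex metacharacters among the separators
            }
            pattern.append(c);
            inWildcard = false;
        } else if (inWildcard == false) {
            pattern.append(".*?"); // one wildcard per run of variable characters
            inWildcard = true;
        }
    }
    return pattern.toString() + simpleDateRegex;
}
// e.g. with prefaces {"[ERROR] [", "[DEBUG] ["} this yields "^\\[.*?\\] \\[" + simpleDateRegex,
// and with {"host-1.acme.com|", "my_host.elastic.co|"} it yields "^.*?\\|" + simpleDateRegex,
// matching the expectations in the tests above.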
@ -0,0 +1,242 @@
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.logstructurefinder;
|
||||
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.DateTimeZone;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.joda.time.format.ISODateTimeFormat;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Locale;
|
||||
|
||||
public class TimestampFormatFinderTests extends LogStructureTestCase {
|
||||
|
||||
public void testFindFirstMatchGivenNoMatch() {
|
||||
|
||||
assertNull(TimestampFormatFinder.findFirstMatch(""));
|
||||
assertNull(TimestampFormatFinder.findFirstMatch("no timestamps in here"));
|
||||
assertNull(TimestampFormatFinder.findFirstMatch(":::"));
|
||||
assertNull(TimestampFormatFinder.findFirstMatch("/+"));
|
||||
}
|
||||
|
||||
public void testFindFirstMatchGivenOnlyIso8601() {
|
||||
|
||||
TimestampMatch expected = new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
|
||||
"");
|
||||
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T16:14:56,374Z", 1526400896374L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+0100", 1526400896374L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+01:00", 1526400896374L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374", 1526400896374L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T16:14:56Z", 1526400896000L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+0100", 1526400896000L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+01:00", 1526400896000L);
|
||||
checkAndValidateDateFormat(expected, "2018-05-15T17:14:56", 1526400896000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56,374Z",
|
||||
1526400896374L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+0100",
|
||||
1526400896374L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+01:00",
|
||||
1526400896374L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374", 1526400896374L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56Z", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+0100", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+01:00", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56", 1526400896000L);
|
||||
}
|
||||
|
||||
public void testFindFirstMatchGivenOnlyKnownDateFormat() {
|
||||
|
||||
// Note: some of the time formats give millisecond accuracy, some second accuracy and some minute accuracy
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z",
|
||||
"\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TOMCAT_DATESTAMP", ""), "2018-05-15 17:14:56,374 +0100",
|
||||
1526400896374L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(8, "", "EEE MMM dd YYYY HH:mm:ss zzz",
|
||||
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
|
||||
"Tue May 15 2018 16:14:56 UTC", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(9, "", "EEE MMM dd YYYY HH:mm zzz",
|
||||
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
|
||||
"Tue May 15 2018 16:14 UTC", 1526400840000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(10, "", "EEE, dd MMM YYYY HH:mm:ss ZZ",
|
||||
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
|
||||
"Tue, 15 May 2018 17:14:56 +01:00", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(11, "", "EEE, dd MMM YYYY HH:mm:ss Z",
|
||||
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
|
||||
"Tue, 15 May 2018 17:14:56 +0100", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(12, "", "EEE, dd MMM YYYY HH:mm ZZ",
|
||||
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
|
||||
"Tue, 15 May 2018 17:14 +01:00", 1526400840000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm Z",
|
||||
"\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +0100",
|
||||
1526400840000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(14, "", "EEE MMM dd HH:mm:ss zzz YYYY",
|
||||
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
|
||||
"Tue May 15 16:14:56 UTC 2018", 1526400896000L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(15, "", "EEE MMM dd HH:mm zzz YYYY",
|
||||
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
|
||||
"Tue May 15 16:14 UTC 2018", 1526400840000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(16, "", "YYYYMMddHHmmss", "\\b\\d{14}\\b", "DATESTAMP_EVENTLOG", ""),
|
||||
"20180515171456", 1526400896000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss YYYY",
|
||||
"\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "HTTPDERROR_DATE", ""),
|
||||
"Tue May 15 17:14:56 2018", 1526400896000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(18, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"),
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56.725", 1526400896725L);
|
||||
checkAndValidateDateFormat(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56", 1526400896000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(20, "", "dd/MMM/YYYY:HH:mm:ss Z",
|
||||
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", ""), "15/May/2018:17:14:56 +0100", 1526400896000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
|
||||
"\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", ""), "May 15, 2018 5:14:56 PM",
|
||||
1526400896000L);
|
||||
|
||||
checkAndValidateDateFormat(new TimestampMatch(22, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "CISCOTIMESTAMP", ""), "May 15 2018 17:14:56",
|
||||
1526400896000L);
|
||||
}
|
||||
|
||||
public void testFindFirstMatchGivenOnlySystemDate() {
|
||||
|
||||
assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
|
||||
TimestampFormatFinder.findFirstMatch("1526400896374"));
|
||||
assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
|
||||
TimestampFormatFinder.findFirstFullMatch("1526400896374"));
|
||||
|
||||
assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
|
||||
TimestampFormatFinder.findFirstMatch("1526400896.736"));
|
||||
assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
|
||||
TimestampFormatFinder.findFirstFullMatch("1526400896.736"));
|
||||
assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
|
||||
TimestampFormatFinder.findFirstMatch("1526400896"));
|
||||
assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
|
||||
TimestampFormatFinder.findFirstFullMatch("1526400896"));
|
||||
|
||||
assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
|
||||
TimestampFormatFinder.findFirstMatch("400000005afb159a164ac980"));
|
||||
assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
|
||||
TimestampFormatFinder.findFirstFullMatch("400000005afb159a164ac980"));
|
||||
}
|
||||
|
||||
private void checkAndValidateDateFormat(TimestampMatch expected, String text, long expectedEpochMs) {
|
||||
|
||||
assertEquals(expected, TimestampFormatFinder.findFirstMatch(text));
|
||||
assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text));
|
||||
|
||||
// All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London
|
||||
DateTimeZone zone = DateTimeZone.forID("Europe/London");
|
||||
DateTime parsed;
|
||||
for (int i = 0; i < expected.dateFormats.size(); ++i) {
|
||||
try {
|
||||
String dateFormat = expected.dateFormats.get(i);
|
||||
switch (dateFormat) {
|
||||
case "ISO8601":
|
||||
parsed = ISODateTimeFormat.dateTimeParser().withZone(zone).withDefaultYear(2018).parseDateTime(text);
|
||||
break;
|
||||
default:
|
||||
DateTimeFormatter parser = DateTimeFormat.forPattern(dateFormat).withZone(zone).withLocale(Locale.UK);
|
||||
parsed = parser.withDefaultYear(2018).parseDateTime(text);
|
||||
break;
|
||||
}
|
||||
if (expectedEpochMs == parsed.getMillis()) {
|
||||
break;
|
||||
}
|
||||
// If the last one isn't right then propagate
|
||||
if (i == expected.dateFormats.size() - 1) {
|
||||
assertEquals(expectedEpochMs, parsed.getMillis());
|
||||
}
|
||||
} catch (RuntimeException e) {
|
||||
// If the last one throws then propagate
|
||||
if (i == expected.dateFormats.size() - 1) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
assertTrue(expected.simplePattern.matcher(text).find());
|
||||
}
|
||||
|
||||
public void testFindFirstMatchGivenRealLogMessages() {
|
||||
|
||||
assertEquals(new TimestampMatch(7, "[", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
|
||||
"][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"),
|
||||
TimestampFormatFinder.findFirstMatch("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " +
|
||||
"heap size [3.9gb], compressed ordinary object pointers [true]"));
|
||||
|
||||
assertEquals(new TimestampMatch(20, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z",
|
||||
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE",
|
||||
"] \"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"),
|
||||
TimestampFormatFinder.findFirstMatch("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " +
|
||||
"\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"));
|
||||
|
||||
assertEquals(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
|
||||
"\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP",
|
||||
" org.apache.tomcat.util.http.Parameters processParameters"),
|
||||
TimestampFormatFinder.findFirstMatch("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters"));
|
||||
|
||||
assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " esxi1.acme.com Vpxa: " +
|
||||
"[3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"),
|
||||
TimestampFormatFinder.findFirstMatch("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
|
||||
"opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"));
|
||||
|
||||
assertEquals(new TimestampMatch(7, "559550912540598297\t", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}",
|
||||
"TIMESTAMP_ISO8601",
|
||||
"\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"),
|
||||
TimestampFormatFinder.findFirstMatch("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
|
||||
"192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"));
|
||||
|
||||
assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"),
|
||||
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP",
|
||||
" dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53"),
|
||||
TimestampFormatFinder.findFirstMatch("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
|
||||
"'www.elastic.co/A/IN': 95.110.68.206#53"));
|
||||
|
||||
assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}",
|
||||
"TIMESTAMP_ISO8601",
|
||||
"|INFO |VirtualServer |1 |client 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " +
|
||||
"'User1'(id:2) in channel '3er Instanz'(id:2)"),
|
||||
TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " +
|
||||
" 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)"));
|
||||
}
|
||||
|
||||
public void testInterpretFractionalSeconds() {
|
||||
assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("Sep 8 11:55:35"));
|
||||
assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("29/Jun/2016:12:11:31 +0000"));
|
||||
assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368"));
|
||||
assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438"));
|
||||
assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764"));
|
||||
assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764"));
|
||||
assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368Z"));
|
||||
assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438Z"));
|
||||
assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764Z"));
|
||||
assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764Z"));
|
||||
assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368 Z"));
|
||||
assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438 Z"));
|
||||
assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764 Z"));
|
||||
assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764 Z"));
|
||||
}
|
||||
}
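
The assertions above pin down two behaviours: findFirstMatch returns a TimestampMatch bundling the candidate index, the text before and after the timestamp, the timestamp format(s), a simple regex and the Grok pattern name, while interpretFractionalSeconds reports the separator character and digit count of any fractional-second component. Below is a minimal sketch of fractional-seconds logic consistent with these assertions, assuming a plain regex approach; the shipped TimestampFormatFinder may implement it differently, and FractionalSecondsSketch is an illustrative name.

import org.elasticsearch.common.collect.Tuple;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

final class FractionalSecondsSketch {

    // Two-digit seconds followed by a ',' or '.' separator and a run of fraction digits.
    private static final Pattern FRACTION = Pattern.compile(":\\d{2}([.,])(\\d+)");

    static Tuple<Character, Integer> interpretFractionalSeconds(String timestamp) {
        Matcher matcher = FRACTION.matcher(timestamp);
        if (matcher.find()) {
            return new Tuple<>(matcher.group(1).charAt(0), matcher.group(2).length());
        }
        // No fractional component: the tests above expect ',' as the default separator and zero digits.
        return new Tuple<>(',', 0);
    }
}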

@ -0,0 +1,33 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class TsvLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new TsvLogStructureFinderFactory();

    // No need to check JSON, XML or CSV because they come earlier in the order we check formats

    public void testCanCreateFromSampleGivenTsv() {

        assertTrue(factory.canCreateFromSample(explanation, TSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {

        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}
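
The "No need to check JSON, XML or CSV" comment above only holds because the factories are consulted in a fixed order and the first canCreateFromSample hit wins, so each factory need only reject formats that come after it. A hypothetical sketch of that ordered scan follows, assuming sibling factory classes for JSON and CSV exist alongside the ones tested here; OrderedFactorySketch and selectFactory are illustrative names, not the library's API.

import java.util.Arrays;
import java.util.List;

final class OrderedFactorySketch {

    // Most specific format first; TSV is only tried once JSON, XML and CSV have been ruled out.
    private static final List<LogStructureFinderFactory> ORDERED_FACTORIES = Arrays.asList(
        new JsonLogStructureFinderFactory(),   // assumed sibling class
        new XmlLogStructureFinderFactory(),
        new CsvLogStructureFinderFactory(),    // assumed sibling class
        new TsvLogStructureFinderFactory()
        // ... semi-colon, pipe and plain-text factories would follow
    );

    static LogStructureFinderFactory selectFactory(List<String> explanation, String sample) {
        for (LogStructureFinderFactory factory : ORDERED_FACTORIES) {
            if (factory.canCreateFromSample(explanation, sample)) {
                return factory;
            }
        }
        throw new IllegalArgumentException("input did not match any known log format");
    }
}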

@ -0,0 +1,43 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

public class XmlLogStructureFinderFactoryTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();

    // No need to check JSON because it comes earlier in the order we check formats

    public void testCanCreateFromSampleGivenXml() {

        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
    }

    public void testCanCreateFromSampleGivenCsv() {

        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenTsv() {

        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
    }

    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenPipeSeparatedValues() {

        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
    }

    public void testCanCreateFromSampleGivenText() {

        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
    }
}
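
These factory tests say nothing about how the XML check itself is made. One plausible way to accept XML_SAMPLE while rejecting the delimited and plain-text samples is to pull-parse the start of the sample and treat any well-formed leading element as a match; this StAX-based sketch is purely illustrative, and the real XmlLogStructureFinderFactory heuristics may differ.

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import java.io.StringReader;

final class XmlSampleCheckSketch {

    static boolean looksLikeXml(String sample) {
        XMLInputFactory xmlFactory = XMLInputFactory.newInstance();
        try {
            XMLStreamReader reader = xmlFactory.createXMLStreamReader(new StringReader(sample.trim()));
            // Advance until the first start element; a well-formed prefix is enough,
            // since a log file is a stream of documents rather than a single one.
            while (reader.hasNext()) {
                if (reader.next() == XMLStreamReader.START_ELEMENT) {
                    return true;
                }
            }
            return false;
        } catch (XMLStreamException e) {
            return false;
        }
    }
}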

@ -0,0 +1,39 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.Collections;

public class XmlLogStructureFinderTests extends LogStructureTestCase {

    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();

    public void testCreateConfigsGivenGoodXml() throws Exception {
        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));

        String charset = randomFrom(POSSIBLE_CHARSETS);
        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
        LogStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker);

        LogStructure structure = structureFinder.getStructure();

        assertEquals(LogStructure.Format.XML, structure.getFormat());
        assertEquals(charset, structure.getCharset());
        if (hasByteOrderMarker == null) {
            assertNull(structure.getHasByteOrderMarker());
        } else {
            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
        }
        assertNull(structure.getExcludeLinesPattern());
        assertEquals("^\\s*<log4j:event", structure.getMultilineStartPattern());
        assertNull(structure.getSeparator());
        assertNull(structure.getHasHeaderRow());
        assertNull(structure.getShouldTrimFields());
        assertNull(structure.getGrokPattern());
        assertEquals("timestamp", structure.getTimestampField());
        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
    }
}
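
Taken together, the call sequence this test exercises is the intended public flow: check canCreateFromSample, build a finder from the sample, then read back the deduced LogStructure. A minimal usage sketch, assuming a UTF-8 sample with no byte order marker and that explanation is the List<String> the base test class provides; error handling is omitted and XmlStructureUsageSketch is an illustrative name.

import java.util.ArrayList;
import java.util.List;

final class XmlStructureUsageSketch {

    static LogStructure deduce(String sample) throws Exception {
        List<String> explanation = new ArrayList<>();
        LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();
        if (factory.canCreateFromSample(explanation, sample) == false) {
            throw new IllegalArgumentException("sample is not XML");
        }
        // Charset and byte-order-marker flag are fixed here for brevity.
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", false);
        return finder.getStructure();
    }
}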