From b61202b0a8508e5a52a164036dc9c3ec862e1557 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 3 Jun 2019 13:44:06 +0100 Subject: [PATCH] [ML] Add a limit on line merging in find_file_structure (#42501) When analysing a semi-structured text file the find_file_structure endpoint merges lines to form multi-line messages using the assumption that the first line in each message contains the timestamp. However, if the timestamp is misdetected then this can lead to excessive numbers of lines being merged to form massive messages. This commit adds a line_merge_size_limit setting (default 10000 characters) that halts the analysis if a message bigger than this is created. This prevents significant CPU time being spent subsequently trying to determine the internal structure of the huge bogus messages. --- .../client/ml/FindFileStructureRequest.java | 18 +++++++- .../ml/FindFileStructureRequestTests.java | 4 ++ .../ml/apis/find-file-structure.asciidoc | 7 +++ .../ml/action/FindFileStructureAction.java | 26 ++++++++++- .../FindFileStructureActionRequestTests.java | 16 +++++++ .../TransportFindFileStructureAction.java | 2 +- .../DelimitedFileStructureFinderFactory.java | 3 +- .../FileStructureFinderFactory.java | 4 +- .../FileStructureFinderManager.java | 44 +++++++++++-------- .../NdJsonFileStructureFinderFactory.java | 3 +- .../TextLogFileStructureFinder.java | 15 ++++++- .../TextLogFileStructureFinderFactory.java | 4 +- .../XmlFileStructureFinderFactory.java | 2 +- .../ml/rest/RestFindFileStructureAction.java | 2 + .../DelimitedFileStructureFinderTests.java | 26 +++++------ .../FileStructureFinderManagerTests.java | 26 +++++++---- .../NdJsonFileStructureFinderTests.java | 2 +- .../TextLogFileStructureFinderTests.java | 44 ++++++++++++++----- .../XmlFileStructureFinderTests.java | 2 +- .../api/ml.find_file_structure.json | 5 +++ .../test/ml/find_file_structure.yml | 1 + 21 files changed, 191 insertions(+), 65 deletions(-) diff --git 
a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java index adfee92bd61..fed417e9582 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java @@ -37,6 +37,7 @@ import java.util.Optional; public class FindFileStructureRequest implements Validatable, ToXContentFragment { public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample"); + public static final ParseField LINE_MERGE_SIZE_LIMIT = new ParseField("line_merge_size_limit"); public static final ParseField TIMEOUT = new ParseField("timeout"); public static final ParseField CHARSET = FileStructure.CHARSET; public static final ParseField FORMAT = FileStructure.FORMAT; @@ -52,6 +53,7 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment public static final ParseField EXPLAIN = new ParseField("explain"); private Integer linesToSample; + private Integer lineMergeSizeLimit; private TimeValue timeout; private String charset; private FileStructure.Format format; @@ -77,6 +79,14 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment this.linesToSample = linesToSample; } + public Integer getLineMergeSizeLimit() { + return lineMergeSizeLimit; + } + + public void setLineMergeSizeLimit(Integer lineMergeSizeLimit) { + this.lineMergeSizeLimit = lineMergeSizeLimit; + } + public TimeValue getTimeout() { return timeout; } @@ -228,6 +238,9 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment if (linesToSample != null) { builder.field(LINES_TO_SAMPLE.getPreferredName(), linesToSample); } + if (lineMergeSizeLimit != null) { + builder.field(LINE_MERGE_SIZE_LIMIT.getPreferredName(), lineMergeSizeLimit); + } if (timeout != null) { 
builder.field(TIMEOUT.getPreferredName(), timeout); } @@ -270,8 +283,8 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment @Override public int hashCode() { - return Objects.hash(linesToSample, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat, - timestampField, explain, sample); + return Objects.hash(linesToSample, lineMergeSizeLimit, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, + timestampFormat, timestampField, explain, sample); } @Override @@ -287,6 +300,7 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment FindFileStructureRequest that = (FindFileStructureRequest) other; return Objects.equals(this.linesToSample, that.linesToSample) && + Objects.equals(this.lineMergeSizeLimit, that.lineMergeSizeLimit) && Objects.equals(this.timeout, that.timeout) && Objects.equals(this.charset, that.charset) && Objects.equals(this.format, that.format) && diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java index 4cb8bf0a7c1..752d0593bef 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java @@ -35,6 +35,7 @@ public class FindFileStructureRequestTests extends AbstractXContentTestCase p.setTimeout(TimeValue.parseTimeValue(c, FindFileStructureRequest.TIMEOUT.getPreferredName())), FindFileStructureRequest.TIMEOUT); PARSER.declareString(FindFileStructureRequest::setCharset, FindFileStructureRequest.CHARSET); @@ -72,6 +73,9 @@ public class FindFileStructureRequestTests extends AbstractXContentTestCase explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker 
timeoutChecker) throws IOException { + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) + throws IOException { return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, csvPreference, trimFields, overrides, timeoutChecker); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java index 8790b8f5268..45edf96ce56 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java @@ -37,6 +37,7 @@ public interface FileStructureFinderFactory { * @param sample A sample from the file to be ingested. * @param charsetName The name of the character set in which the sample was provided. * @param hasByteOrderMarker Did the sample have a byte order marker? null means "not relevant". + * @param lineMergeSizeLimit Maximum number of characters permitted when lines are merged to create messages. * @param overrides Stores structure decisions that have been made by the end user, and should * take precedence over anything the {@link FileStructureFinder} may decide. * @param timeoutChecker Will abort the operation if its timeout is exceeded. @@ -44,5 +45,6 @@ public interface FileStructureFinderFactory { * @throws Exception if something goes wrong during creation. 
*/ FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception; + int lineMergeSizeLimit, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws Exception; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java index 4f26276c3df..2fa8d1bb6d6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java @@ -43,6 +43,7 @@ public final class FileStructureFinderManager { public static final int MIN_SAMPLE_LINE_COUNT = 2; public static final int DEFAULT_IDEAL_SAMPLE_LINE_COUNT = 1000; + public static final int DEFAULT_LINE_MERGE_SIZE_LIMIT = 10000; static final Set FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( "866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252", @@ -96,8 +97,9 @@ public final class FileStructureFinderManager { this.scheduler = Objects.requireNonNull(scheduler); } - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { - return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, + InputStream fromFile) throws Exception { + return findFileStructure(idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); } /** @@ -106,6 +108,8 @@ public final class FileStructureFinderManager { * 
If the stream has fewer lines then an attempt will still be made, providing at * least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read. If null * the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used. + * @param lineMergeSizeLimit Maximum number of characters permitted when lines are merged to create messages. + * If null the value of {@link #DEFAULT_LINE_MERGE_SIZE_LIMIT} will be used. * @param fromFile A stream from which the sample will be read. * @param overrides Aspects of the file structure that are known in advance. These take precedence over * values determined by structure analysis. An exception will be thrown if the file structure @@ -116,20 +120,21 @@ public final class FileStructureFinderManager { * @return A {@link FileStructureFinder} object from which the structure and messages can be queried. * @throws Exception A variety of problems could occur at various stages of the structure finding process. */ - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides, - TimeValue timeout) - throws Exception { - return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, - fromFile, overrides, timeout); - } - - public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile) - throws Exception { - return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); - } - - public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile, + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, InputStream fromFile, FileStructureOverrides overrides, TimeValue timeout) throws Exception { + return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? 
DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, + (lineMergeSizeLimit == null) ? DEFAULT_LINE_MERGE_SIZE_LIMIT : lineMergeSizeLimit, fromFile, overrides, timeout); + } + + public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, int lineMergeSizeLimit, + InputStream fromFile) throws Exception { + return findFileStructure(explanation, idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, + null); + } + + public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, int lineMergeSizeLimit, + InputStream fromFile, FileStructureOverrides overrides, + TimeValue timeout) throws Exception { try (TimeoutChecker timeoutChecker = new TimeoutChecker("structure analysis", timeout, scheduler)) { @@ -148,7 +153,8 @@ public final class FileStructureFinderManager { Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount), timeoutChecker); - return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides, timeoutChecker); + return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), lineMergeSizeLimit, overrides, + timeoutChecker); } catch (Exception e) { // Add a dummy exception containing the explanation so far - this can be invaluable for troubleshooting as incorrect // decisions made early on in the structure analysis can result in seemingly crazy decisions or timeouts later on @@ -263,7 +269,8 @@ public final class FileStructureFinderManager { } FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception { + int lineMergeSizeLimit, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws Exception { Character delimiter = overrides.getDelimiter(); Character quote = 
overrides.getQuote(); @@ -295,7 +302,8 @@ public final class FileStructureFinderManager { for (FileStructureFinderFactory factory : factories) { timeoutChecker.check("high level format detection"); if (factory.canCreateFromSample(explanation, sample)) { - return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); + return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides, + timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java index 43612890bc8..6970af01bb7 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java @@ -68,7 +68,8 @@ public class NdJsonFileStructureFinderFactory implements FileStructureFinderFact @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException { + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) + throws IOException { return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index d07eea15f97..86b1d79b8b6 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -6,6 +6,7 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import org.elasticsearch.common.collect.Tuple; +import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; @@ -24,8 +25,8 @@ public class TextLogFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) { + Boolean hasByteOrderMarker, int lineMergeSizeLimit, + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { String[] sampleLines = sample.split("\n"); TimestampFormatFinder timestampFormatFinder = populateTimestampFormatFinder(explanation, sampleLines, overrides, timeoutChecker); switch (timestampFormatFinder.getNumMatchedFormats()) { @@ -69,6 +70,16 @@ public class TextLogFileStructureFinder implements FileStructureFinder { // for the CSV header or lines before the first XML document starts) ++linesConsumed; } else { + // This check avoids subsequent problems when a massive message is unwieldy and slow to process + long lengthAfterAppend = message.length() + 1L + sampleLine.length(); + if (lengthAfterAppend > lineMergeSizeLimit) { + assert linesInMessage > 0; + throw new IllegalArgumentException("Merging lines into messages resulted in an unacceptably long message. " + + "Merged message would have [" + (linesInMessage + 1) + "] lines and [" + lengthAfterAppend + "] " + + "characters (limit [" + lineMergeSizeLimit + "]). 
If you have messages this big please increase " + + "the value of [" + FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT + "]. Otherwise it " + + "probably means the timestamp has been incorrectly detected, so try overriding that."); + } message.append('\n').append(sampleLine); ++linesInMessage; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java index 5931fea5f1a..2980d5d0678 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java @@ -41,8 +41,8 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - overrides, timeoutChecker); + lineMergeSizeLimit, overrides, timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java index 97984d1d775..382f2e75027 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java @@ -125,7 +125,7 @@ public class 
XmlFileStructureFinderFactory implements FileStructureFinderFactory @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException, ParserConfigurationException, SAXException { return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java index 5810a2e929d..03c3fb2a39f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java @@ -53,6 +53,8 @@ public class RestFindFileStructureAction extends BaseRestHandler { FindFileStructureAction.Request request = new FindFileStructureAction.Request(); request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT)); + request.setLineMergeSizeLimit(restRequest.paramAsInt(FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT.getPreferredName(), + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT)); request.setTimeout(TimeValue.parseTimeValue(restRequest.param(FindFileStructureAction.Request.TIMEOUT.getPreferredName()), DEFAULT_TIMEOUT, FindFileStructureAction.Request.TIMEOUT.getPreferredName())); request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName())); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 280a50324e4..7b157555eef 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -30,7 +30,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -64,8 +64,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -101,8 +101,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder 
structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -135,7 +135,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -170,7 +170,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -214,8 +214,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = 
structureFinder.getStructure(); @@ -255,7 +255,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -301,8 +301,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -340,7 +340,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java index f68d8edc612..188bc9a628b 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java @@ -102,7 +102,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { public void testMakeBestStructureGivenNdJson() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(), - randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class)); + randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(NdJsonFileStructureFinder.class)); } public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception { @@ -113,12 +114,14 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(), - randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), + instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenXml() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(XmlFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + 
instanceOf(XmlFileStructureFinder.class)); } public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception { @@ -126,12 +129,14 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), + instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenCsv() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenCsvAndJsonOverride() { @@ -140,14 +145,15 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides, NOOP_TIMEOUT_CHECKER)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER)); assertEquals("Input did not match the specified format [ndjson]", e.getMessage()); } public void testMakeBestStructureGivenText() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), 
instanceOf(TextLogFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception { @@ -157,7 +163,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), + instanceOf(DelimitedFileStructureFinder.class)); } public void testFindFileStructureTimeout() throws IOException, InterruptedException { @@ -190,7 +197,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { junkProducer.start(); ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, - () -> structureFinderManager.findFileStructure(explanation, linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout)); + () -> structureFinderManager.findFileStructure(explanation, linesOfJunk - 1, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, bigInput, EMPTY_OVERRIDES, timeout)); assertThat(e.getMessage(), startsWith("Aborting structure analysis during [")); assertThat(e.getMessage(), endsWith("] as it has taken longer than the timeout of [" + timeout + "]")); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java index a220bdf3b06..048d2708e77 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java +++ 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java @@ -19,7 +19,7 @@ public class NdJsonFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index 6ac672f6178..4c921c8a9f9 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -20,13 +20,36 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory(); + public void testCreateConfigsGivenLowLineMergeSizeLimit() { + + String sample = "2019-05-16 16:56:14 line 1 abcdefghijklmnopqrstuvwxyz\n" + + "2019-05-16 16:56:14 line 2 abcdefghijklmnopqrstuvwxyz\n" + + "continuation line 2.1\n" + + "continuation line 2.2\n" + + "continuation line 2.3\n" + + "continuation line 2.4\n" + + "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n"; + + assertTrue(factory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + IllegalArgumentException e = 
expectThrows(IllegalArgumentException.class, + () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, 100, + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER)); + + assertEquals("Merging lines into messages resulted in an unacceptably long message. Merged message would have [4] lines and " + + "[119] characters (limit [100]). If you have messages this big please increase the value of [line_merge_size_limit]. " + + "Otherwise it probably means the timestamp has been incorrectly detected, so try overriding that.", e.getMessage()); + } + public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -66,8 +89,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -102,8 +125,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = 
randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -139,8 +162,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -181,7 +204,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, NOOP_TIMEOUT_CHECKER)); + () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER)); assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage()); @@ -200,8 +224,8 @@ public class TextLogFileStructureFinderTests extends 
FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, FileStructureOverrides.EMPTY_OVERRIDES, - NOOP_TIMEOUT_CHECKER)); + () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER)); assertEquals("Failed to create more than one message from the sample lines provided. (The last is discarded in " + "case the sample is incomplete.) If your sample does contain multiple messages the problem is probably that " diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index b6f93a6e39b..9ad07f61427 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -19,7 +19,7 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json 
b/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json index 4e5550ae824..2f65a5d9749 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json +++ b/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json @@ -11,6 +11,11 @@ "description": "How many lines of the file should be included in the analysis", "default": 1000 }, + "line_merge_size_limit": { + "type": "int", + "description": "Maximum number of characters permitted in a single message when lines are merged to create messages.", + "default": 10000 + }, "timeout": { "type": "time", "description": "Timeout after which the analysis will be aborted", diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index 7c6aff66e3d..a9634605aaa 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -10,6 +10,7 @@ setup: Content-Type: "application/json" ml.find_file_structure: lines_to_sample: 3 + line_merge_size_limit: 1234 timeout: 10s body: - airline: AAL