diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java index adfee92bd61..fed417e9582 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/FindFileStructureRequest.java @@ -37,6 +37,7 @@ import java.util.Optional; public class FindFileStructureRequest implements Validatable, ToXContentFragment { public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample"); + public static final ParseField LINE_MERGE_SIZE_LIMIT = new ParseField("line_merge_size_limit"); public static final ParseField TIMEOUT = new ParseField("timeout"); public static final ParseField CHARSET = FileStructure.CHARSET; public static final ParseField FORMAT = FileStructure.FORMAT; @@ -52,6 +53,7 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment public static final ParseField EXPLAIN = new ParseField("explain"); private Integer linesToSample; + private Integer lineMergeSizeLimit; private TimeValue timeout; private String charset; private FileStructure.Format format; @@ -77,6 +79,14 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment this.linesToSample = linesToSample; } + public Integer getLineMergeSizeLimit() { + return lineMergeSizeLimit; + } + + public void setLineMergeSizeLimit(Integer lineMergeSizeLimit) { + this.lineMergeSizeLimit = lineMergeSizeLimit; + } + public TimeValue getTimeout() { return timeout; } @@ -228,6 +238,9 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment if (linesToSample != null) { builder.field(LINES_TO_SAMPLE.getPreferredName(), linesToSample); } + if (lineMergeSizeLimit != null) { + builder.field(LINE_MERGE_SIZE_LIMIT.getPreferredName(), lineMergeSizeLimit); + } if (timeout != null) { builder.field(TIMEOUT.getPreferredName(), timeout); } @@ -270,8 +283,8 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment @Override public int hashCode() { - return Objects.hash(linesToSample, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat, - timestampField, explain, sample); + return Objects.hash(linesToSample, lineMergeSizeLimit, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, + timestampFormat, timestampField, explain, sample); } @Override @@ -287,6 +300,7 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment FindFileStructureRequest that = (FindFileStructureRequest) other; return Objects.equals(this.linesToSample, that.linesToSample) && + Objects.equals(this.lineMergeSizeLimit, that.lineMergeSizeLimit) && Objects.equals(this.timeout, that.timeout) && Objects.equals(this.charset, that.charset) && Objects.equals(this.format, that.format) && diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java index 4cb8bf0a7c1..752d0593bef 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/FindFileStructureRequestTests.java @@ -35,6 +35,7 @@ public class FindFileStructureRequestTests extends AbstractXContentTestCase p.setTimeout(TimeValue.parseTimeValue(c, FindFileStructureRequest.TIMEOUT.getPreferredName())), FindFileStructureRequest.TIMEOUT); PARSER.declareString(FindFileStructureRequest::setCharset, FindFileStructureRequest.CHARSET); @@ -72,6 +73,9 @@ public class FindFileStructureRequestTests extends AbstractXContentTestCase explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException { + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) + throws IOException { return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, csvPreference, trimFields, overrides, timeoutChecker); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java index 8790b8f5268..45edf96ce56 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java @@ -37,6 +37,7 @@ public interface FileStructureFinderFactory { * @param sample A sample from the file to be ingested. * @param charsetName The name of the character set in which the sample was provided. * @param hasByteOrderMarker Did the sample have a byte order marker? null means "not relevant". + * @param lineMergeSizeLimit Maximum number of characters permitted when lines are merged to create messages. * @param overrides Stores structure decisions that have been made by the end user, and should * take precedence over anything the {@link FileStructureFinder} may decide. * @param timeoutChecker Will abort the operation if its timeout is exceeded. @@ -44,5 +45,6 @@ public interface FileStructureFinderFactory { * @throws Exception if something goes wrong during creation. */ FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception; + int lineMergeSizeLimit, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws Exception; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java index 4f26276c3df..2fa8d1bb6d6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java @@ -43,6 +43,7 @@ public final class FileStructureFinderManager { public static final int MIN_SAMPLE_LINE_COUNT = 2; public static final int DEFAULT_IDEAL_SAMPLE_LINE_COUNT = 1000; + public static final int DEFAULT_LINE_MERGE_SIZE_LIMIT = 10000; static final Set FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( "866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252", @@ -96,8 +97,9 @@ public final class FileStructureFinderManager { this.scheduler = Objects.requireNonNull(scheduler); } - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { - return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, + InputStream fromFile) throws Exception { + return findFileStructure(idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); } /** @@ -106,6 +108,8 @@ public final class FileStructureFinderManager { * If the stream has fewer lines then an attempt will still be made, providing at * least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read. If null * the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used. + * @param lineMergeSizeLimit Maximum number of characters permitted when lines are merged to create messages. + * If null the value of {@link #DEFAULT_LINE_MERGE_SIZE_LIMIT} will be used. * @param fromFile A stream from which the sample will be read. * @param overrides Aspects of the file structure that are known in advance. These take precedence over * values determined by structure analysis. An exception will be thrown if the file structure @@ -116,20 +120,21 @@ public final class FileStructureFinderManager { * @return A {@link FileStructureFinder} object from which the structure and messages can be queried. * @throws Exception A variety of problems could occur at various stages of the structure finding process. */ - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides, - TimeValue timeout) - throws Exception { - return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, - fromFile, overrides, timeout); - } - - public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile) - throws Exception { - return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); - } - - public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile, + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, InputStream fromFile, FileStructureOverrides overrides, TimeValue timeout) throws Exception { + return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, + (lineMergeSizeLimit == null) ? DEFAULT_LINE_MERGE_SIZE_LIMIT : lineMergeSizeLimit, fromFile, overrides, timeout); + } + + public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, int lineMergeSizeLimit, + InputStream fromFile) throws Exception { + return findFileStructure(explanation, idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, + null); + } + + public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, int lineMergeSizeLimit, + InputStream fromFile, FileStructureOverrides overrides, + TimeValue timeout) throws Exception { try (TimeoutChecker timeoutChecker = new TimeoutChecker("structure analysis", timeout, scheduler)) { @@ -148,7 +153,8 @@ public final class FileStructureFinderManager { Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount), timeoutChecker); - return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides, timeoutChecker); + return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), lineMergeSizeLimit, overrides, + timeoutChecker); } catch (Exception e) { // Add a dummy exception containing the explanation so far - this can be invaluable for troubleshooting as incorrect // decisions made early on in the structure analysis can result in seemingly crazy decisions or timeouts later on @@ -263,7 +269,8 @@ public final class FileStructureFinderManager { } FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception { + int lineMergeSizeLimit, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws Exception { Character delimiter = overrides.getDelimiter(); Character quote = overrides.getQuote(); @@ -295,7 +302,8 @@ public final class FileStructureFinderManager { for (FileStructureFinderFactory factory : factories) { timeoutChecker.check("high level format detection"); if (factory.canCreateFromSample(explanation, sample)) { - return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); + return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides, + timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java index 43612890bc8..6970af01bb7 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java @@ -68,7 +68,8 @@ public class NdJsonFileStructureFinderFactory implements FileStructureFinderFact @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException { + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) + throws IOException { return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index d07eea15f97..86b1d79b8b6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -6,6 +6,7 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import org.elasticsearch.common.collect.Tuple; +import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; @@ -24,8 +25,8 @@ public class TextLogFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) { + Boolean hasByteOrderMarker, int lineMergeSizeLimit, + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { String[] sampleLines = sample.split("\n"); TimestampFormatFinder timestampFormatFinder = populateTimestampFormatFinder(explanation, sampleLines, overrides, timeoutChecker); switch (timestampFormatFinder.getNumMatchedFormats()) { @@ -69,6 +70,16 @@ public class TextLogFileStructureFinder implements FileStructureFinder { // for the CSV header or lines before the first XML document starts) ++linesConsumed; } else { + // This check avoids subsequent problems when a massive message is unwieldy and slow to process + long lengthAfterAppend = message.length() + 1L + sampleLine.length(); + if (lengthAfterAppend > lineMergeSizeLimit) { + assert linesInMessage > 0; + throw new IllegalArgumentException("Merging lines into messages resulted in an unacceptably long message. " + + "Merged message would have [" + (linesInMessage + 1) + "] lines and [" + lengthAfterAppend + "] " + + "characters (limit [" + lineMergeSizeLimit + "]). If you have messages this big please increase " + + "the value of [" + FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT + "]. Otherwise it " + + "probably means the timestamp has been incorrectly detected, so try overriding that."); + } message.append('\n').append(sampleLine); ++linesInMessage; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java index 5931fea5f1a..2980d5d0678 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java @@ -41,8 +41,8 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - overrides, timeoutChecker); + lineMergeSizeLimit, overrides, timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java index 97984d1d775..382f2e75027 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java @@ -125,7 +125,7 @@ public class XmlFileStructureFinderFactory implements FileStructureFinderFactory @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) + int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException, ParserConfigurationException, SAXException { return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java index 5810a2e929d..03c3fb2a39f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java @@ -53,6 +53,8 @@ public class RestFindFileStructureAction extends BaseRestHandler { FindFileStructureAction.Request request = new FindFileStructureAction.Request(); request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT)); + request.setLineMergeSizeLimit(restRequest.paramAsInt(FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT.getPreferredName(), + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT)); request.setTimeout(TimeValue.parseTimeValue(restRequest.param(FindFileStructureAction.Request.TIMEOUT.getPreferredName()), DEFAULT_TIMEOUT, FindFileStructureAction.Request.TIMEOUT.getPreferredName())); request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName())); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 280a50324e4..7b157555eef 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -30,7 +30,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -64,8 +64,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -101,8 +101,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -135,7 +135,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -170,7 +170,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -214,8 +214,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -255,7 +255,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -301,8 +301,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -340,7 +340,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java index f68d8edc612..188bc9a628b 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java @@ -102,7 +102,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { public void testMakeBestStructureGivenNdJson() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(), - randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class)); + randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(NdJsonFileStructureFinder.class)); } public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception { @@ -113,12 +114,14 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(), - randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), + instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenXml() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(XmlFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(XmlFileStructureFinder.class)); } public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception { @@ -126,12 +129,14 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), + instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenCsv() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenCsvAndJsonOverride() { @@ -140,14 +145,15 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides, NOOP_TIMEOUT_CHECKER)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER)); assertEquals("Input did not match the specified format [ndjson]", e.getMessage()); } public void testMakeBestStructureGivenText() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), + instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception { @@ -157,7 +163,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), + instanceOf(DelimitedFileStructureFinder.class)); } public void testFindFileStructureTimeout() throws IOException, InterruptedException { @@ -190,7 +197,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase { junkProducer.start(); ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, - () -> structureFinderManager.findFileStructure(explanation, linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout)); + () -> structureFinderManager.findFileStructure(explanation, FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout)); assertThat(e.getMessage(), startsWith("Aborting structure analysis during [")); assertThat(e.getMessage(), endsWith("] as it has taken longer than the timeout of [" + timeout + "]")); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java index a220bdf3b06..048d2708e77 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java @@ -19,7 +19,7 @@ public class NdJsonFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index 6ac672f6178..4c921c8a9f9 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -20,13 +20,36 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory(); + public void testCreateConfigsGivenLowLineMergeSizeLimit() { + + String sample = "2019-05-16 16:56:14 line 1 abcdefghijklmnopqrstuvwxyz\n" + + "2019-05-16 16:56:14 line 2 abcdefghijklmnopqrstuvwxyz\n" + + "continuation line 2.1\n" + + "continuation line 2.2\n" + + "continuation line 2.3\n" + + "continuation line 2.4\n" + + "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n"; + + assertTrue(factory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, 100, + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER)); + + assertEquals("Merging lines into messages resulted in an unacceptably long message. Merged message would have [4] lines and " + + "[119] characters (limit [100]). If you have messages this big please increase the value of [line_merge_size_limit]. " + + "Otherwise it probably means the timestamp has been incorrectly detected, so try overriding that.", e.getMessage()); + } + public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -66,8 +89,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -102,8 +125,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -139,8 +162,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, - NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -181,7 +204,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, NOOP_TIMEOUT_CHECKER)); + () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER)); assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage()); @@ -200,8 +224,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, FileStructureOverrides.EMPTY_OVERRIDES, - NOOP_TIMEOUT_CHECKER)); + () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER)); assertEquals("Failed to create more than one message from the sample lines provided. (The last is discarded in " + "case the sample is incomplete.) If your sample does contain multiple messages the problem is probably that " diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index b6f93a6e39b..9ad07f61427 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -19,7 +19,7 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json b/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json index 4e5550ae824..2f65a5d9749 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json +++ b/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json @@ -11,6 +11,11 @@ "description": "How many lines of the file should be included in the analysis", "default": 1000 }, + "line_merge_size_limit": { + "type": "int", + "description": "Maximum number of characters permitted in a single message when lines are merged to create messages.", + "default": 10000 + }, "timeout": { "type": "time", "description": "Timeout after which the analysis will be aborted", diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index 7c6aff66e3d..a9634605aaa 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -10,6 +10,7 @@ setup: Content-Type: "application/json" ml.find_file_structure: lines_to_sample: 3 + line_merge_size_limit: 1234 timeout: 10s body: - airline: AAL