[ML] Add a limit on line merging in find_file_structure (#42501)

When analysing a semi-structured text file the
find_file_structure endpoint merges lines to form
multi-line messages using the assumption that the
first line in each message contains the timestamp.
However, if the timestamp is misdetected then this
can lead to excessive numbers of lines being merged
to form massive messages.

This commit adds a line_merge_size_limit setting
(default 10000 characters) that halts the analysis
if a message bigger than this is created.  This
prevents significant CPU time being spent subsequently
trying to determine the internal structure of the
huge bogus messages.
This commit is contained in:
David Roberts 2019-06-03 13:44:06 +01:00
parent 0253927ec4
commit b61202b0a8
21 changed files with 191 additions and 65 deletions

View File

@ -37,6 +37,7 @@ import java.util.Optional;
public class FindFileStructureRequest implements Validatable, ToXContentFragment {
public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
public static final ParseField LINE_MERGE_SIZE_LIMIT = new ParseField("line_merge_size_limit");
public static final ParseField TIMEOUT = new ParseField("timeout");
public static final ParseField CHARSET = FileStructure.CHARSET;
public static final ParseField FORMAT = FileStructure.FORMAT;
@ -52,6 +53,7 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment
public static final ParseField EXPLAIN = new ParseField("explain");
private Integer linesToSample;
private Integer lineMergeSizeLimit;
private TimeValue timeout;
private String charset;
private FileStructure.Format format;
@ -77,6 +79,14 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment
this.linesToSample = linesToSample;
}
/**
 * @return the configured limit on merged message size, or <code>null</code> if none has been set on this request.
 */
public Integer getLineMergeSizeLimit() {
    return this.lineMergeSizeLimit;
}
/**
 * Stores the limit on merged message size for this request.
 *
 * @param lineMergeSizeLimit the limit to store; <code>null</code> leaves the field out of the serialized request.
 */
public void setLineMergeSizeLimit(final Integer lineMergeSizeLimit) {
    this.lineMergeSizeLimit = lineMergeSizeLimit;
}
/**
 * @return the timeout stored on this request, or <code>null</code> if none has been set.
 */
public TimeValue getTimeout() {
    return this.timeout;
}
@ -228,6 +238,9 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment
if (linesToSample != null) {
builder.field(LINES_TO_SAMPLE.getPreferredName(), linesToSample);
}
if (lineMergeSizeLimit != null) {
builder.field(LINE_MERGE_SIZE_LIMIT.getPreferredName(), lineMergeSizeLimit);
}
if (timeout != null) {
builder.field(TIMEOUT.getPreferredName(), timeout);
}
@ -270,8 +283,8 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment
@Override
public int hashCode() {
return Objects.hash(linesToSample, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat,
timestampField, explain, sample);
return Objects.hash(linesToSample, lineMergeSizeLimit, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern,
timestampFormat, timestampField, explain, sample);
}
@Override
@ -287,6 +300,7 @@ public class FindFileStructureRequest implements Validatable, ToXContentFragment
FindFileStructureRequest that = (FindFileStructureRequest) other;
return Objects.equals(this.linesToSample, that.linesToSample) &&
Objects.equals(this.lineMergeSizeLimit, that.lineMergeSizeLimit) &&
Objects.equals(this.timeout, that.timeout) &&
Objects.equals(this.charset, that.charset) &&
Objects.equals(this.format, that.format) &&

View File

@ -35,6 +35,7 @@ public class FindFileStructureRequestTests extends AbstractXContentTestCase<Find
static {
PARSER.declareInt(FindFileStructureRequest::setLinesToSample, FindFileStructureRequest.LINES_TO_SAMPLE);
PARSER.declareInt(FindFileStructureRequest::setLineMergeSizeLimit, FindFileStructureRequest.LINE_MERGE_SIZE_LIMIT);
PARSER.declareString((p, c) -> p.setTimeout(TimeValue.parseTimeValue(c, FindFileStructureRequest.TIMEOUT.getPreferredName())),
FindFileStructureRequest.TIMEOUT);
PARSER.declareString(FindFileStructureRequest::setCharset, FindFileStructureRequest.CHARSET);
@ -72,6 +73,9 @@ public class FindFileStructureRequestTests extends AbstractXContentTestCase<Find
if (randomBoolean()) {
findFileStructureRequest.setLinesToSample(randomIntBetween(1000, 2000));
}
if (randomBoolean()) {
findFileStructureRequest.setLineMergeSizeLimit(randomIntBetween(10000, 20000));
}
if (randomBoolean()) {
findFileStructureRequest.setTimeout(TimeValue.timeValueSeconds(randomIntBetween(10, 20)));
}

View File

@ -92,6 +92,13 @@ chosen.
parameter is not specified, the structure finder guesses based on the similarity of
the first row of the file to other rows.
`line_merge_size_limit`::
(unsigned integer) The maximum number of characters in a message when lines are
merged to form messages while analyzing semi-structured files. The default
is 10000. If you have extremely long messages, you may need to increase this value,
but be aware that doing so may lead to very long processing times if the way to
group lines into messages is misdetected.
`lines_to_sample`::
(unsigned integer) The number of lines to include in the structural analysis,
starting from the beginning of the file. The minimum is 2; the default

View File

@ -5,6 +5,7 @@
*/
package org.elasticsearch.xpack.core.ml.action;
import org.elasticsearch.Version;
import org.elasticsearch.action.Action;
import org.elasticsearch.action.ActionRequest;
import org.elasticsearch.action.ActionRequestBuilder;
@ -113,6 +114,7 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
public static class Request extends ActionRequest {
public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
public static final ParseField LINE_MERGE_SIZE_LIMIT = new ParseField("line_merge_size_limit");
public static final ParseField TIMEOUT = new ParseField("timeout");
public static final ParseField CHARSET = FileStructure.CHARSET;
public static final ParseField FORMAT = FileStructure.FORMAT;
@ -130,6 +132,7 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
"[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]";
private Integer linesToSample;
private Integer lineMergeSizeLimit;
private TimeValue timeout;
private String charset;
private FileStructure.Format format;
@ -154,6 +157,14 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
this.linesToSample = linesToSample;
}
/**
 * @return the configured limit on merged message size, or <code>null</code> if none has been set on this request.
 */
public Integer getLineMergeSizeLimit() {
    return this.lineMergeSizeLimit;
}
/**
 * Stores the limit on merged message size for this request.
 *
 * @param lineMergeSizeLimit the limit to store; may be <code>null</code>. Request validation rejects
 *                           a non-null value that is zero or negative.
 */
public void setLineMergeSizeLimit(final Integer lineMergeSizeLimit) {
    this.lineMergeSizeLimit = lineMergeSizeLimit;
}
/**
 * @return the timeout stored on this request, or <code>null</code> if none has been set.
 */
public TimeValue getTimeout() {
    return this.timeout;
}
@ -291,6 +302,10 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
validationException =
addValidationError("[" + LINES_TO_SAMPLE.getPreferredName() + "] must be positive if specified", validationException);
}
if (lineMergeSizeLimit != null && lineMergeSizeLimit <= 0) {
validationException = addValidationError("[" + LINE_MERGE_SIZE_LIMIT.getPreferredName() + "] must be positive if specified",
validationException);
}
if (format != FileStructure.Format.DELIMITED) {
if (columnNames != null) {
validationException = addIncompatibleArgError(COLUMN_NAMES, FileStructure.Format.DELIMITED, validationException);
@ -324,6 +339,9 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
linesToSample = in.readOptionalVInt();
if (in.getVersion().onOrAfter(Version.CURRENT)) {
lineMergeSizeLimit = in.readOptionalVInt();
}
timeout = in.readOptionalTimeValue();
charset = in.readOptionalString();
format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null;
@ -342,6 +360,9 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeOptionalVInt(linesToSample);
if (out.getVersion().onOrAfter(Version.CURRENT)) {
out.writeOptionalVInt(lineMergeSizeLimit);
}
out.writeOptionalTimeValue(timeout);
out.writeOptionalString(charset);
if (format == null) {
@ -378,8 +399,8 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
@Override
public int hashCode() {
return Objects.hash(linesToSample, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat,
timestampField, sample);
return Objects.hash(linesToSample, lineMergeSizeLimit, timeout, charset, format, columnNames, hasHeaderRow, delimiter,
grokPattern, timestampFormat, timestampField, sample);
}
@Override
@ -395,6 +416,7 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
Request that = (Request) other;
return Objects.equals(this.linesToSample, that.linesToSample) &&
Objects.equals(this.lineMergeSizeLimit, that.lineMergeSizeLimit) &&
Objects.equals(this.timeout, that.timeout) &&
Objects.equals(this.charset, that.charset) &&
Objects.equals(this.format, that.format) &&

View File

@ -26,6 +26,10 @@ public class FindFileStructureActionRequestTests extends AbstractStreamableTestC
request.setLinesToSample(randomIntBetween(10, 2000));
}
if (randomBoolean()) {
request.setLineMergeSizeLimit(randomIntBetween(1000, 20000));
}
if (randomBoolean()) {
request.setCharset(randomAlphaOfLength(10));
}
@ -85,6 +89,18 @@ public class FindFileStructureActionRequestTests extends AbstractStreamableTestC
assertThat(e.getMessage(), containsString(" [lines_to_sample] must be positive if specified"));
}
/**
 * Checks that request validation fails with the expected message when
 * [line_merge_size_limit] is set to zero or a negative number.
 */
public void testValidateLineMergeSizeLimit() {
    FindFileStructureAction.Request invalidRequest = new FindFileStructureAction.Request();
    invalidRequest.setLineMergeSizeLimit(randomIntBetween(-1, 0));
    invalidRequest.setSample(new BytesArray("foo\n"));

    ActionRequestValidationException validationFailure = invalidRequest.validate();
    assertNotNull(validationFailure);

    String failureMessage = validationFailure.getMessage();
    assertThat(failureMessage, startsWith("Validation Failed: "));
    assertThat(failureMessage, containsString(" [line_merge_size_limit] must be positive if specified"));
}
public void testValidateNonDelimited() {
FindFileStructureAction.Request request = new FindFileStructureAction.Request();

View File

@ -49,7 +49,7 @@ public class TransportFindFileStructureAction
FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(threadPool.scheduler());
FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(),
request.getSample().streamInput(), new FileStructureOverrides(request), request.getTimeout());
request.getLineMergeSizeLimit(), request.getSample().streamInput(), new FileStructureOverrides(request), request.getTimeout());
return new FindFileStructureAction.Response(fileStructureFinder.getStructure());
}

View File

@ -62,7 +62,8 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF
@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker)
throws IOException {
return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
csvPreference, trimFields, overrides, timeoutChecker);
}

View File

@ -37,6 +37,7 @@ public interface FileStructureFinderFactory {
* @param sample A sample from the file to be ingested.
* @param charsetName The name of the character set in which the sample was provided.
* @param hasByteOrderMarker Did the sample have a byte order marker? <code>null</code> means "not relevant".
* @param lineMergeSizeLimit Maximum number of characters permitted when lines are merged to create messages.
* @param overrides Stores structure decisions that have been made by the end user, and should
* take precedence over anything the {@link FileStructureFinder} may decide.
* @param timeoutChecker Will abort the operation if its timeout is exceeded.
@ -44,5 +45,6 @@ public interface FileStructureFinderFactory {
* @throws Exception if something goes wrong during creation.
*/
FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception;
int lineMergeSizeLimit, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws Exception;
}

View File

@ -43,6 +43,7 @@ public final class FileStructureFinderManager {
public static final int MIN_SAMPLE_LINE_COUNT = 2;
public static final int DEFAULT_IDEAL_SAMPLE_LINE_COUNT = 1000;
public static final int DEFAULT_LINE_MERGE_SIZE_LIMIT = 10000;
static final Set<String> FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252",
@ -96,8 +97,9 @@ public final class FileStructureFinderManager {
this.scheduler = Objects.requireNonNull(scheduler);
}
public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception {
return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null);
public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit,
InputStream fromFile) throws Exception {
return findFileStructure(idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null);
}
/**
@ -106,6 +108,8 @@ public final class FileStructureFinderManager {
* If the stream has fewer lines then an attempt will still be made, providing at
* least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read. If <code>null</code>
* the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used.
* @param lineMergeSizeLimit Maximum number of characters permitted when lines are merged to create messages.
* If <code>null</code> the value of {@link #DEFAULT_LINE_MERGE_SIZE_LIMIT} will be used.
* @param fromFile A stream from which the sample will be read.
* @param overrides Aspects of the file structure that are known in advance. These take precedence over
* values determined by structure analysis. An exception will be thrown if the file structure
@ -116,20 +120,21 @@ public final class FileStructureFinderManager {
* @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
* @throws Exception A variety of problems could occur at various stages of the structure finding process.
*/
public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides,
TimeValue timeout)
throws Exception {
return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount,
fromFile, overrides, timeout);
}
public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
throws Exception {
return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null);
}
public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile,
public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, InputStream fromFile,
FileStructureOverrides overrides, TimeValue timeout) throws Exception {
return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount,
(lineMergeSizeLimit == null) ? DEFAULT_LINE_MERGE_SIZE_LIMIT : lineMergeSizeLimit, fromFile, overrides, timeout);
}
public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, int lineMergeSizeLimit,
InputStream fromFile) throws Exception {
return findFileStructure(explanation, idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES,
null);
}
public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, int lineMergeSizeLimit,
InputStream fromFile, FileStructureOverrides overrides,
TimeValue timeout) throws Exception {
try (TimeoutChecker timeoutChecker = new TimeoutChecker("structure analysis", timeout, scheduler)) {
@ -148,7 +153,8 @@ public final class FileStructureFinderManager {
Tuple<String, Boolean> sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT,
Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount), timeoutChecker);
return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides, timeoutChecker);
return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), lineMergeSizeLimit, overrides,
timeoutChecker);
} catch (Exception e) {
// Add a dummy exception containing the explanation so far - this can be invaluable for troubleshooting as incorrect
// decisions made early on in the structure analysis can result in seemingly crazy decisions or timeouts later on
@ -263,7 +269,8 @@ public final class FileStructureFinderManager {
}
FileStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception {
int lineMergeSizeLimit, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws Exception {
Character delimiter = overrides.getDelimiter();
Character quote = overrides.getQuote();
@ -295,7 +302,8 @@ public final class FileStructureFinderManager {
for (FileStructureFinderFactory factory : factories) {
timeoutChecker.check("high level format detection");
if (factory.canCreateFromSample(explanation, sample)) {
return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker);
return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides,
timeoutChecker);
}
}

View File

@ -68,7 +68,8 @@ public class NdJsonFileStructureFinderFactory implements FileStructureFinderFact
@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker)
throws IOException {
return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
timeoutChecker);
}

View File

@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.filestructurefinder;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats;
import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
@ -24,8 +25,8 @@ public class TextLogFileStructureFinder implements FileStructureFinder {
private final FileStructure structure;
static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) {
Boolean hasByteOrderMarker, int lineMergeSizeLimit,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) {
String[] sampleLines = sample.split("\n");
TimestampFormatFinder timestampFormatFinder = populateTimestampFormatFinder(explanation, sampleLines, overrides, timeoutChecker);
switch (timestampFormatFinder.getNumMatchedFormats()) {
@ -69,6 +70,16 @@ public class TextLogFileStructureFinder implements FileStructureFinder {
// for the CSV header or lines before the first XML document starts)
++linesConsumed;
} else {
// This check avoids subsequent problems when a massive message is unwieldy and slow to process
long lengthAfterAppend = message.length() + 1L + sampleLine.length();
if (lengthAfterAppend > lineMergeSizeLimit) {
assert linesInMessage > 0;
throw new IllegalArgumentException("Merging lines into messages resulted in an unacceptably long message. "
+ "Merged message would have [" + (linesInMessage + 1) + "] lines and [" + lengthAfterAppend + "] "
+ "characters (limit [" + lineMergeSizeLimit + "]). If you have messages this big please increase "
+ "the value of [" + FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT + "]. Otherwise it "
+ "probably means the timestamp has been incorrectly detected, so try overriding that.");
}
message.append('\n').append(sampleLine);
++linesInMessage;
}

View File

@ -41,8 +41,8 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac
@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) {
int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) {
return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
overrides, timeoutChecker);
lineMergeSizeLimit, overrides, timeoutChecker);
}
}

View File

@ -125,7 +125,7 @@ public class XmlFileStructureFinderFactory implements FileStructureFinderFactory
@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker)
int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker)
throws IOException, ParserConfigurationException, SAXException {
return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
timeoutChecker);

View File

@ -53,6 +53,8 @@ public class RestFindFileStructureAction extends BaseRestHandler {
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(),
FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT));
request.setLineMergeSizeLimit(restRequest.paramAsInt(FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT.getPreferredName(),
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT));
request.setTimeout(TimeValue.parseTimeValue(restRequest.param(FindFileStructureAction.Request.TIMEOUT.getPreferredName()),
DEFAULT_TIMEOUT, FindFileStructureAction.Request.TIMEOUT.getPreferredName()));
request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName()));

View File

@ -30,7 +30,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -64,8 +64,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -101,8 +101,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -135,7 +135,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -170,7 +170,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -214,8 +214,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -255,7 +255,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -301,8 +301,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -340,7 +340,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();

View File

@ -102,7 +102,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
public void testMakeBestStructureGivenNdJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER),
instanceOf(NdJsonFileStructureFinder.class));
}
public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {
@ -113,12 +114,14 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER),
instanceOf(DelimitedFileStructureFinder.class));
}
public void testMakeBestStructureGivenXml() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(XmlFileStructureFinder.class));
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER),
instanceOf(XmlFileStructureFinder.class));
}
public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception {
@ -126,12 +129,14 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build();
assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class));
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER),
instanceOf(TextLogFileStructureFinder.class));
}
public void testMakeBestStructureGivenCsv() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER),
instanceOf(DelimitedFileStructureFinder.class));
}
public void testMakeBestStructureGivenCsvAndJsonOverride() {
@ -140,14 +145,15 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER));
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER));
assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
}
public void testMakeBestStructureGivenText() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class));
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER),
instanceOf(TextLogFileStructureFinder.class));
}
public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception {
@ -157,7 +163,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
.setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build();
assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER),
instanceOf(DelimitedFileStructureFinder.class));
}
public void testFindFileStructureTimeout() throws IOException, InterruptedException {
@ -190,7 +197,8 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
junkProducer.start();
ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class,
() -> structureFinderManager.findFileStructure(explanation, linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout));
() -> structureFinderManager.findFileStructure(explanation, FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT,
linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout));
assertThat(e.getMessage(), startsWith("Aborting structure analysis during ["));
assertThat(e.getMessage(), endsWith("] as it has taken longer than the timeout of [" + timeout + "]"));

View File

@ -19,7 +19,7 @@ public class NdJsonFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();

View File

@ -20,13 +20,36 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
/**
 * Verifies that creating a structure finder from a sample is rejected with an {@link IllegalArgumentException} when a
 * deliberately low line merge size limit (100) is supplied, and that the error message reports the merged line count,
 * the merged character count and the limit.
 */
public void testCreateConfigsGivenLowLineMergeSizeLimit() {
    // Three timestamped lines; the second one is followed by four lines that do not start with a timestamp.
    final String sample = "2019-05-16 16:56:14 line 1 abcdefghijklmnopqrstuvwxyz\n" +
        "2019-05-16 16:56:14 line 2 abcdefghijklmnopqrstuvwxyz\n" +
        "continuation line 2.1\n" +
        "continuation line 2.2\n" +
        "continuation line 2.3\n" +
        "continuation line 2.4\n" +
        "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n";
    // Limit passed to the factory; the expected message below quotes it as "(limit [100])".
    final int lowLineMergeSizeLimit = 100;
    final String expectedMessage =
        "Merging lines into messages resulted in an unacceptably long message. Merged message would have [4] lines and " +
        "[119] characters (limit [100]). If you have messages this big please increase the value of [line_merge_size_limit]. " +
        "Otherwise it probably means the timestamp has been incorrectly detected, so try overriding that.";

    assertTrue(factory.canCreateFromSample(explanation, sample));

    final String chosenCharset = randomFrom(POSSIBLE_CHARSETS);
    final Boolean byteOrderMarkerPresent = randomHasByteOrderMarker(chosenCharset);

    IllegalArgumentException thrown = expectThrows(
        IllegalArgumentException.class,
        () -> factory.createFromSample(explanation, sample, chosenCharset, byteOrderMarkerPresent, lowLineMergeSizeLimit,
            FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER));

    assertEquals(expectedMessage, thrown.getMessage());
}
public void testCreateConfigsGivenElasticsearchLog() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -66,8 +89,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -102,8 +125,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -139,8 +162,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides,
NOOP_TIMEOUT_CHECKER);
FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
@ -181,7 +204,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, NOOP_TIMEOUT_CHECKER));
() -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER));
assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " +
"\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage());
@ -200,8 +224,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, FileStructureOverrides.EMPTY_OVERRIDES,
NOOP_TIMEOUT_CHECKER));
() -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER));
assertEquals("Failed to create more than one message from the sample lines provided. (The last is discarded in "
+ "case the sample is incomplete.) If your sample does contain multiple messages the problem is probably that "

View File

@ -19,7 +19,7 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase {
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();

View File

@ -11,6 +11,11 @@
"description": "How many lines of the file should be included in the analysis",
"default": 1000
},
"line_merge_size_limit": {
"type": "int",
"description": "Maximum number of characters permitted in a single message when lines are merged to create messages.",
"default": 10000
},
"timeout": {
"type": "time",
"description": "Timeout after which the analysis will be aborted",

View File

@ -10,6 +10,7 @@ setup:
Content-Type: "application/json"
ml.find_file_structure:
lines_to_sample: 3
line_merge_size_limit: 1234
timeout: 10s
body:
- airline: AAL