[ML] Rename the json file structure to ndjson (#34901)
The file structure finder endpoint can find the NDJSON (newline-delimited JSON) file format, but called it `json`. This change renames the `format` for this file structure to `ndjson`, which is more precise and will hopefully avoid confusion.
This commit is contained in:
parent
f13d529448
commit
c455be7bc2
|
@ -74,7 +74,7 @@ chosen.
|
||||||
structure finder produced its result. The default value is `false`.
|
structure finder produced its result. The default value is `false`.
|
||||||
|
|
||||||
`format`::
|
`format`::
|
||||||
(string) The high level structure of the file. Valid values are `json`, `xml`,
|
(string) The high level structure of the file. Valid values are `ndjson`, `xml`,
|
||||||
`delimited`, and `semi_structured_text`. If this parameter is not specified,
|
`delimited`, and `semi_structured_text`. If this parameter is not specified,
|
||||||
the structure finder chooses one.
|
the structure finder chooses one.
|
||||||
|
|
||||||
|
@ -259,7 +259,7 @@ If the request does not encounter errors, you receive the following result:
|
||||||
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
|
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
|
||||||
"charset" : "UTF-8", <4>
|
"charset" : "UTF-8", <4>
|
||||||
"has_byte_order_marker" : false, <5>
|
"has_byte_order_marker" : false, <5>
|
||||||
"format" : "json", <6>
|
"format" : "ndjson", <6>
|
||||||
"need_client_timezone" : false, <7>
|
"need_client_timezone" : false, <7>
|
||||||
"mappings" : { <8>
|
"mappings" : { <8>
|
||||||
"author" : {
|
"author" : {
|
||||||
|
@ -473,14 +473,14 @@ If the request does not encounter errors, you receive the following result:
|
||||||
|
|
||||||
<1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
|
<1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
|
||||||
<2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
|
<2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
|
||||||
For ND-JSON, this value is the same as `num_lines_analyzed`. For other file
|
For NDJSON, this value is the same as `num_lines_analyzed`. For other file
|
||||||
formats, messages can span several lines.
|
formats, messages can span several lines.
|
||||||
<3> `sample_start` reproduces the first two messages in the file verbatim. This
|
<3> `sample_start` reproduces the first two messages in the file verbatim. This
|
||||||
may help to diagnose parse errors or accidental uploads of the wrong file.
|
may help to diagnose parse errors or accidental uploads of the wrong file.
|
||||||
<4> `charset` indicates the character encoding used to parse the file.
|
<4> `charset` indicates the character encoding used to parse the file.
|
||||||
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
|
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
|
||||||
file begins with a byte order marker.
|
file begins with a byte order marker.
|
||||||
<6> `format` is one of `json`, `xml`, `delimited` or `semi_structured_text`.
|
<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
|
||||||
<7> If a timestamp format is detected that does not include a timezone,
|
<7> If a timestamp format is detected that does not include a timezone,
|
||||||
`need_client_timezone` will be `true`. The server that parses the file must
|
`need_client_timezone` will be `true`. The server that parses the file must
|
||||||
therefore be told the correct timezone by the client.
|
therefore be told the correct timezone by the client.
|
||||||
|
|
|
@ -32,11 +32,11 @@ public class FileStructure implements ToXContentObject, Writeable {
|
||||||
|
|
||||||
public enum Format {
|
public enum Format {
|
||||||
|
|
||||||
JSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
|
NDJSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
|
||||||
|
|
||||||
public boolean supportsNesting() {
|
public boolean supportsNesting() {
|
||||||
switch (this) {
|
switch (this) {
|
||||||
case JSON:
|
case NDJSON:
|
||||||
case XML:
|
case XML:
|
||||||
return true;
|
return true;
|
||||||
case DELIMITED:
|
case DELIMITED:
|
||||||
|
@ -49,7 +49,7 @@ public class FileStructure implements ToXContentObject, Writeable {
|
||||||
|
|
||||||
public boolean isStructured() {
|
public boolean isStructured() {
|
||||||
switch (this) {
|
switch (this) {
|
||||||
case JSON:
|
case NDJSON:
|
||||||
case XML:
|
case XML:
|
||||||
case DELIMITED:
|
case DELIMITED:
|
||||||
return true;
|
return true;
|
||||||
|
@ -62,7 +62,7 @@ public class FileStructure implements ToXContentObject, Writeable {
|
||||||
|
|
||||||
public boolean isSemiStructured() {
|
public boolean isSemiStructured() {
|
||||||
switch (this) {
|
switch (this) {
|
||||||
case JSON:
|
case NDJSON:
|
||||||
case XML:
|
case XML:
|
||||||
case DELIMITED:
|
case DELIMITED:
|
||||||
return false;
|
return false;
|
||||||
|
@ -645,7 +645,7 @@ public class FileStructure implements ToXContentObject, Writeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (format) {
|
switch (format) {
|
||||||
case JSON:
|
case NDJSON:
|
||||||
if (shouldTrimFields != null) {
|
if (shouldTrimFields != null) {
|
||||||
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
|
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
|
||||||
}
|
}
|
||||||
|
|
|
@ -124,7 +124,7 @@ public class FindFileStructureActionRequestTests extends AbstractStreamableTestC
|
||||||
public void testValidateNonSemiStructuredText() {
|
public void testValidateNonSemiStructuredText() {
|
||||||
|
|
||||||
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
|
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
|
||||||
request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
|
request.setFormat(randomFrom(FileStructure.Format.NDJSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
|
||||||
request.setGrokPattern(randomAlphaOfLength(80));
|
request.setGrokPattern(randomAlphaOfLength(80));
|
||||||
request.setSample(new BytesArray("foo\n"));
|
request.setSample(new BytesArray("foo\n"));
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ import java.util.stream.Collectors;
|
||||||
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
|
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
|
||||||
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
|
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
|
||||||
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
|
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
|
||||||
* 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
|
* 3. Determine the most likely file structure - one of NDJSON, XML, delimited or semi-structured text
|
||||||
* 4. Create an appropriate structure object and delegate writing configs to it
|
* 4. Create an appropriate structure object and delegate writing configs to it
|
||||||
*/
|
*/
|
||||||
public final class FileStructureFinderManager {
|
public final class FileStructureFinderManager {
|
||||||
|
@ -73,9 +73,9 @@ public final class FileStructureFinderManager {
|
||||||
* These need to be ordered so that the more generic formats come after the more specific ones
|
* These need to be ordered so that the more generic formats come after the more specific ones
|
||||||
*/
|
*/
|
||||||
private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
|
private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
|
||||||
new JsonFileStructureFinderFactory(),
|
new NdJsonFileStructureFinderFactory(),
|
||||||
new XmlFileStructureFinderFactory(),
|
new XmlFileStructureFinderFactory(),
|
||||||
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
|
// NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV
|
||||||
new DelimitedFileStructureFinderFactory(',', '"', 2, false),
|
new DelimitedFileStructureFinderFactory(',', '"', 2, false),
|
||||||
new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
|
new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
|
||||||
new DelimitedFileStructureFinderFactory(';', '"', 4, false),
|
new DelimitedFileStructureFinderFactory(';', '"', 4, false),
|
||||||
|
|
|
@ -25,16 +25,16 @@ import java.util.stream.Collectors;
|
||||||
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
|
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Really ND-JSON.
|
* Newline-delimited JSON.
|
||||||
*/
|
*/
|
||||||
public class JsonFileStructureFinder implements FileStructureFinder {
|
public class NdJsonFileStructureFinder implements FileStructureFinder {
|
||||||
|
|
||||||
private final List<String> sampleMessages;
|
private final List<String> sampleMessages;
|
||||||
private final FileStructure structure;
|
private final FileStructure structure;
|
||||||
|
|
||||||
static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
|
static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
|
||||||
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
|
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
|
||||||
TimeoutChecker timeoutChecker) throws IOException {
|
TimeoutChecker timeoutChecker) throws IOException {
|
||||||
|
|
||||||
List<Map<String, ?>> sampleRecords = new ArrayList<>();
|
List<Map<String, ?>> sampleRecords = new ArrayList<>();
|
||||||
|
|
||||||
|
@ -43,10 +43,10 @@ public class JsonFileStructureFinder implements FileStructureFinder {
|
||||||
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
|
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
|
||||||
sampleMessage);
|
sampleMessage);
|
||||||
sampleRecords.add(parser.mapOrdered());
|
sampleRecords.add(parser.mapOrdered());
|
||||||
timeoutChecker.check("JSON parsing");
|
timeoutChecker.check("NDJSON parsing");
|
||||||
}
|
}
|
||||||
|
|
||||||
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON)
|
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON)
|
||||||
.setCharset(charsetName)
|
.setCharset(charsetName)
|
||||||
.setHasByteOrderMarker(hasByteOrderMarker)
|
.setHasByteOrderMarker(hasByteOrderMarker)
|
||||||
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
|
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
|
||||||
|
@ -84,10 +84,10 @@ public class JsonFileStructureFinder implements FileStructureFinder {
|
||||||
.setExplanation(explanation)
|
.setExplanation(explanation)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
return new JsonFileStructureFinder(sampleMessages, structure);
|
return new NdJsonFileStructureFinder(sampleMessages, structure);
|
||||||
}
|
}
|
||||||
|
|
||||||
private JsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
|
private NdJsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
|
||||||
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
|
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
|
||||||
this.structure = structure;
|
this.structure = structure;
|
||||||
}
|
}
|
|
@ -17,15 +17,15 @@ import java.util.Locale;
|
||||||
|
|
||||||
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
|
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
|
||||||
|
|
||||||
public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
|
public class NdJsonFileStructureFinderFactory implements FileStructureFinderFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean canFindFormat(FileStructure.Format format) {
|
public boolean canFindFormat(FileStructure.Format format) {
|
||||||
return format == null || format == FileStructure.Format.JSON;
|
return format == null || format == FileStructure.Format.NDJSON;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This format matches if the sample consists of one or more JSON documents.
|
* This format matches if the sample consists of one or more JSON documents.
|
||||||
* If there is more than one, they must be newline-delimited. The
|
* If there is more than one, they must be newline-delimited. The
|
||||||
* documents must be non-empty, to prevent lines containing "{}" from matching.
|
* documents must be non-empty, to prevent lines containing "{}" from matching.
|
||||||
*/
|
*/
|
||||||
|
@ -41,35 +41,35 @@ public class JsonFileStructureFinderFactory implements FileStructureFinderFactor
|
||||||
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
|
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
|
||||||
|
|
||||||
if (parser.map().isEmpty()) {
|
if (parser.map().isEmpty()) {
|
||||||
explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
|
explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
++completeDocCount;
|
++completeDocCount;
|
||||||
if (parser.nextToken() != null) {
|
if (parser.nextToken() != null) {
|
||||||
explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
|
explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" +
|
||||||
sampleLine + "]");
|
sampleLine + "]");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException | IllegalStateException e) {
|
} catch (IOException | IllegalStateException e) {
|
||||||
explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
|
explanation.add("Not NDJSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (completeDocCount == 0) {
|
if (completeDocCount == 0) {
|
||||||
explanation.add("Not JSON because sample didn't contain a complete document");
|
explanation.add("Not NDJSON because sample didn't contain a complete document");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
explanation.add("Deciding sample is newline delimited JSON");
|
explanation.add("Deciding sample is newline delimited NDJSON");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
|
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
|
||||||
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
|
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
|
||||||
return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
|
return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
|
||||||
timeoutChecker);
|
timeoutChecker);
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,7 +12,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
|
||||||
private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
|
private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
|
||||||
private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);
|
private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);
|
||||||
|
|
||||||
// CSV - no need to check JSON or XML because they come earlier in the order we check formats
|
// CSV - no need to check NDJSON or XML because they come earlier in the order we check formats
|
||||||
|
|
||||||
public void testCanCreateCsvFromSampleGivenCsv() {
|
public void testCanCreateCsvFromSampleGivenCsv() {
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
|
||||||
assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
|
assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
|
||||||
}
|
}
|
||||||
|
|
||||||
// TSV - no need to check JSON, XML or CSV because they come earlier in the order we check formats
|
// TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats
|
||||||
|
|
||||||
public void testCanCreateTsvFromSampleGivenTsv() {
|
public void testCanCreateTsvFromSampleGivenTsv() {
|
||||||
|
|
||||||
|
@ -61,7 +61,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
|
||||||
assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
|
assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Semi-colon delimited - no need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
|
// Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats
|
||||||
|
|
||||||
public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
|
public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
|
||||||
assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
|
assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pipe delimited - no need to check JSON, XML, CSV, TSV or semi-colon delimited
|
// Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
|
||||||
// values because they come earlier in the order we check formats
|
// values because they come earlier in the order we check formats
|
||||||
|
|
||||||
public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {
|
public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {
|
||||||
|
|
|
@ -73,20 +73,20 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMakeBestStructureGivenJson() throws Exception {
|
public void testMakeBestStructureGivenNdJson() throws Exception {
|
||||||
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
|
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
|
||||||
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class));
|
randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
|
public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {
|
||||||
|
|
||||||
// Need to change the quote character from the default of double quotes
|
// Need to change the quote character from the default of double quotes
|
||||||
// otherwise the quotes in the JSON will stop it parsing as CSV
|
// otherwise the quotes in the NDJSON will stop it parsing as CSV
|
||||||
FileStructureOverrides overrides = FileStructureOverrides.builder()
|
FileStructureOverrides overrides = FileStructureOverrides.builder()
|
||||||
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
|
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
|
||||||
|
|
||||||
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
|
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
|
||||||
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
|
randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMakeBestStructureGivenXml() throws Exception {
|
public void testMakeBestStructureGivenXml() throws Exception {
|
||||||
|
@ -109,13 +109,13 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
|
||||||
|
|
||||||
public void testMakeBestStructureGivenCsvAndJsonOverride() {
|
public void testMakeBestStructureGivenCsvAndJsonOverride() {
|
||||||
|
|
||||||
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
|
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build();
|
||||||
|
|
||||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||||
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
|
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
|
||||||
overrides, NOOP_TIMEOUT_CHECKER));
|
overrides, NOOP_TIMEOUT_CHECKER));
|
||||||
|
|
||||||
assertEquals("Input did not match the specified format [json]", e.getMessage());
|
assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMakeBestStructureGivenText() throws Exception {
|
public void testMakeBestStructureGivenText() throws Exception {
|
||||||
|
|
|
@ -27,7 +27,7 @@ public abstract class FileStructureTestCase extends ESTestCase {
|
||||||
"2018-05-17T16:23:40,key1,42.0\n" +
|
"2018-05-17T16:23:40,key1,42.0\n" +
|
||||||
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";
|
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";
|
||||||
|
|
||||||
protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
|
protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
|
||||||
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
|
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
|
||||||
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
|
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
|
||||||
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +
|
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +
|
||||||
|
|
|
@ -9,21 +9,21 @@ import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
|
||||||
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
|
||||||
public class JsonFileStructureFinderTests extends FileStructureTestCase {
|
public class NdJsonFileStructureFinderTests extends FileStructureTestCase {
|
||||||
|
|
||||||
private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
|
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
|
||||||
|
|
||||||
public void testCreateConfigsGivenGoodJson() throws Exception {
|
public void testCreateConfigsGivenGoodJson() throws Exception {
|
||||||
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
|
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
|
||||||
|
|
||||||
String charset = randomFrom(POSSIBLE_CHARSETS);
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
||||||
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
||||||
FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
|
FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
|
||||||
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
|
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
|
||||||
|
|
||||||
FileStructure structure = structureFinder.getStructure();
|
FileStructure structure = structureFinder.getStructure();
|
||||||
|
|
||||||
assertEquals(FileStructure.Format.JSON, structure.getFormat());
|
assertEquals(FileStructure.Format.NDJSON, structure.getFormat());
|
||||||
assertEquals(charset, structure.getCharset());
|
assertEquals(charset, structure.getCharset());
|
||||||
if (hasByteOrderMarker == null) {
|
if (hasByteOrderMarker == null) {
|
||||||
assertNull(structure.getHasByteOrderMarker());
|
assertNull(structure.getHasByteOrderMarker());
|
|
@ -5,13 +5,13 @@
|
||||||
*/
|
*/
|
||||||
package org.elasticsearch.xpack.ml.filestructurefinder;
|
package org.elasticsearch.xpack.ml.filestructurefinder;
|
||||||
|
|
||||||
public class JsonFileStructureFinderFactoryTests extends FileStructureTestCase {
|
public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCase {
|
||||||
|
|
||||||
private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
|
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
|
||||||
|
|
||||||
public void testCanCreateFromSampleGivenJson() {
|
public void testCanCreateFromSampleGivenNdJson() {
|
||||||
|
|
||||||
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
|
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCanCreateFromSampleGivenXml() {
|
public void testCanCreateFromSampleGivenXml() {
|
|
@ -9,7 +9,7 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas
|
||||||
|
|
||||||
private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
|
private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
|
||||||
|
|
||||||
// No need to check JSON, XML, CSV, TSV, semi-colon delimited values or pipe
|
// No need to check NDJSON, XML, CSV, TSV, semi-colon delimited values or pipe
|
||||||
// delimited values because they come earlier in the order we check formats
|
// delimited values because they come earlier in the order we check formats
|
||||||
|
|
||||||
public void testCanCreateFromSampleGivenText() {
|
public void testCanCreateFromSampleGivenText() {
|
||||||
|
|
|
@ -9,7 +9,7 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase {
|
||||||
|
|
||||||
private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();
|
private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();
|
||||||
|
|
||||||
// No need to check JSON because it comes earlier in the order we check formats
|
// No need to check NDJSON because it comes earlier in the order we check formats
|
||||||
|
|
||||||
public void testCanCreateFromSampleGivenXml() {
|
public void testCanCreateFromSampleGivenXml() {
|
||||||
|
|
||||||
|
|
|
@ -22,7 +22,7 @@
|
||||||
},
|
},
|
||||||
"format": {
|
"format": {
|
||||||
"type": "enum",
|
"type": "enum",
|
||||||
"options": [ "json", "xml", "delimited", "semi_structured_text" ],
|
"options": [ "ndjson", "xml", "delimited", "semi_structured_text" ],
|
||||||
"description": "Optional parameter to specify the high level file format"
|
"description": "Optional parameter to specify the high level file format"
|
||||||
},
|
},
|
||||||
"has_header_row": {
|
"has_header_row": {
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
---
|
---
|
||||||
"Test JSON file structure analysis without overrides":
|
"Test NDJSON file structure analysis without overrides":
|
||||||
- do:
|
- do:
|
||||||
headers:
|
headers:
|
||||||
# This is to stop the usual content type randomization, which
|
# This is to stop the usual content type randomization, which
|
||||||
|
@ -26,7 +26,7 @@
|
||||||
- match: { num_messages_analyzed: 3 }
|
- match: { num_messages_analyzed: 3 }
|
||||||
- match: { charset: "UTF-8" }
|
- match: { charset: "UTF-8" }
|
||||||
- match: { has_byte_order_marker: false }
|
- match: { has_byte_order_marker: false }
|
||||||
- match: { format: json }
|
- match: { format: ndjson }
|
||||||
- match: { timestamp_field: time }
|
- match: { timestamp_field: time }
|
||||||
- match: { joda_timestamp_formats.0: UNIX }
|
- match: { joda_timestamp_formats.0: UNIX }
|
||||||
- match: { java_timestamp_formats.0: UNIX }
|
- match: { java_timestamp_formats.0: UNIX }
|
||||||
|
@ -56,7 +56,7 @@
|
||||||
- is_false: explanation
|
- is_false: explanation
|
||||||
|
|
||||||
---
|
---
|
||||||
"Test JSON file structure analysis with overrides":
|
"Test NDJSON file structure analysis with overrides":
|
||||||
- do:
|
- do:
|
||||||
headers:
|
headers:
|
||||||
# This is to stop the usual content type randomization, which
|
# This is to stop the usual content type randomization, which
|
||||||
|
@ -64,7 +64,7 @@
|
||||||
Content-Type: "application/json"
|
Content-Type: "application/json"
|
||||||
xpack.ml.find_file_structure:
|
xpack.ml.find_file_structure:
|
||||||
charset: UTF-8
|
charset: UTF-8
|
||||||
format: json
|
format: ndjson
|
||||||
timestamp_field: time
|
timestamp_field: time
|
||||||
timestamp_format: UNIX
|
timestamp_format: UNIX
|
||||||
explain: true
|
explain: true
|
||||||
|
@ -86,7 +86,7 @@
|
||||||
- match: { num_messages_analyzed: 3 }
|
- match: { num_messages_analyzed: 3 }
|
||||||
- match: { charset: "UTF-8" }
|
- match: { charset: "UTF-8" }
|
||||||
- match: { has_byte_order_marker: false }
|
- match: { has_byte_order_marker: false }
|
||||||
- match: { format: json }
|
- match: { format: ndjson }
|
||||||
- match: { timestamp_field: time }
|
- match: { timestamp_field: time }
|
||||||
- match: { joda_timestamp_formats.0: UNIX }
|
- match: { joda_timestamp_formats.0: UNIX }
|
||||||
- match: { java_timestamp_formats.0: UNIX }
|
- match: { java_timestamp_formats.0: UNIX }
|
||||||
|
|
Loading…
Reference in New Issue