[ML] Rename the json file structure to ndjson (#34901)

The file structure finder endpoint can detect the NDJSON
(newline-delimited JSON) file format, but reported it as
`json`. This change renames the `format` value for this file
structure to `ndjson`, which is more precise and will
hopefully avoid confusion.
This commit is contained in:
David Roberts 2018-10-29 10:06:12 +01:00 committed by GitHub
parent f13d529448
commit c455be7bc2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 62 additions and 62 deletions

View File

@ -74,7 +74,7 @@ chosen.
structure finder produced its result. The default value is `false`.
`format`::
(string) The high level structure of the file. Valid values are `json`, `xml`,
(string) The high level structure of the file. Valid values are `ndjson`, `xml`,
`delimited`, and `semi_structured_text`. If this parameter is not specified,
the structure finder chooses one.
@ -259,7 +259,7 @@ If the request does not encounter errors, you receive the following result:
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
"charset" : "UTF-8", <4>
"has_byte_order_marker" : false, <5>
"format" : "json", <6>
"format" : "ndjson", <6>
"need_client_timezone" : false, <7>
"mappings" : { <8>
"author" : {
@ -473,14 +473,14 @@ If the request does not encounter errors, you receive the following result:
<1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
<2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
For ND-JSON, this value is the same as `num_lines_analyzed`. For other file
For NDJSON, this value is the same as `num_lines_analyzed`. For other file
formats, messages can span several lines.
<3> `sample_start` reproduces the first two messages in the file verbatim. This
may help to diagnose parse errors or accidental uploads of the wrong file.
<4> `charset` indicates the character encoding used to parse the file.
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
file begins with a byte order marker.
<6> `format` is one of `json`, `xml`, `delimited` or `semi_structured_text`.
<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
<7> If a timestamp format is detected that does not include a timezone,
`need_client_timezone` will be `true`. The server that parses the file must
therefore be told the correct timezone by the client.

View File

@ -32,11 +32,11 @@ public class FileStructure implements ToXContentObject, Writeable {
public enum Format {
JSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
NDJSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
public boolean supportsNesting() {
switch (this) {
case JSON:
case NDJSON:
case XML:
return true;
case DELIMITED:
@ -49,7 +49,7 @@ public class FileStructure implements ToXContentObject, Writeable {
public boolean isStructured() {
switch (this) {
case JSON:
case NDJSON:
case XML:
case DELIMITED:
return true;
@ -62,7 +62,7 @@ public class FileStructure implements ToXContentObject, Writeable {
public boolean isSemiStructured() {
switch (this) {
case JSON:
case NDJSON:
case XML:
case DELIMITED:
return false;
@ -645,7 +645,7 @@ public class FileStructure implements ToXContentObject, Writeable {
}
switch (format) {
case JSON:
case NDJSON:
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}

View File

@ -124,7 +124,7 @@ public class FindFileStructureActionRequestTests extends AbstractStreamableTestC
public void testValidateNonSemiStructuredText() {
FindFileStructureAction.Request request = new FindFileStructureAction.Request();
request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
request.setFormat(randomFrom(FileStructure.Format.NDJSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
request.setGrokPattern(randomAlphaOfLength(80));
request.setSample(new BytesArray("foo\n"));

View File

@ -35,7 +35,7 @@ import java.util.stream.Collectors;
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
* 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
* 3. Determine the most likely file structure - one of NDJSON, XML, delimited or semi-structured text
* 4. Create an appropriate structure object and delegate writing configs to it
*/
public final class FileStructureFinderManager {
@ -73,9 +73,9 @@ public final class FileStructureFinderManager {
* These need to be ordered so that the more generic formats come after the more specific ones
*/
private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
new JsonFileStructureFinderFactory(),
new NdJsonFileStructureFinderFactory(),
new XmlFileStructureFinderFactory(),
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
// NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV
new DelimitedFileStructureFinderFactory(',', '"', 2, false),
new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
new DelimitedFileStructureFinderFactory(';', '"', 4, false),

View File

@ -25,16 +25,16 @@ import java.util.stream.Collectors;
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
/**
* Really ND-JSON.
* Newline-delimited JSON.
*/
public class JsonFileStructureFinder implements FileStructureFinder {
public class NdJsonFileStructureFinder implements FileStructureFinder {
private final List<String> sampleMessages;
private final FileStructure structure;
static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws IOException {
static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws IOException {
List<Map<String, ?>> sampleRecords = new ArrayList<>();
@ -43,10 +43,10 @@ public class JsonFileStructureFinder implements FileStructureFinder {
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
sampleMessage);
sampleRecords.add(parser.mapOrdered());
timeoutChecker.check("JSON parsing");
timeoutChecker.check("NDJSON parsing");
}
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON)
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON)
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
@ -84,10 +84,10 @@ public class JsonFileStructureFinder implements FileStructureFinder {
.setExplanation(explanation)
.build();
return new JsonFileStructureFinder(sampleMessages, structure);
return new NdJsonFileStructureFinder(sampleMessages, structure);
}
private JsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
private NdJsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}

View File

@ -17,15 +17,15 @@ import java.util.Locale;
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
public class NdJsonFileStructureFinderFactory implements FileStructureFinderFactory {
@Override
public boolean canFindFormat(FileStructure.Format format) {
return format == null || format == FileStructure.Format.JSON;
return format == null || format == FileStructure.Format.NDJSON;
}
/**
* This format matches if the sample consists of one or more JSON documents.
* This format (NDJSON) matches if the sample consists of one or more JSON documents.
* If there is more than one, they must be newline-delimited. The
* documents must be non-empty, to prevent lines containing "{}" from matching.
*/
@ -41,35 +41,35 @@ public class JsonFileStructureFinderFactory implements FileStructureFinderFactor
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
if (parser.map().isEmpty()) {
explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]");
return false;
}
++completeDocCount;
if (parser.nextToken() != null) {
explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" +
sampleLine + "]");
return false;
}
}
}
} catch (IOException | IllegalStateException e) {
explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
explanation.add("Not NDJSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
return false;
}
if (completeDocCount == 0) {
explanation.add("Not JSON because sample didn't contain a complete document");
explanation.add("Not NDJSON because sample didn't contain a complete document");
return false;
}
explanation.add("Deciding sample is newline delimited JSON");
explanation.add("Deciding sample is newline delimited NDJSON");
return true;
}
@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
timeoutChecker);
}

View File

@ -12,7 +12,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);
// CSV - no need to check JSON or XML because they come earlier in the order we check formats
// CSV - no need to check NDJSON or XML because they come earlier in the order we check formats
public void testCanCreateCsvFromSampleGivenCsv() {
@ -39,7 +39,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
// TSV - no need to check JSON, XML or CSV because they come earlier in the order we check formats
// TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats
public void testCanCreateTsvFromSampleGivenTsv() {
@ -61,7 +61,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
// Semi-colon delimited - no need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
// Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats
public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
@ -78,7 +78,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}
// Pipe delimited - no need to check JSON, XML, CSV, TSV or semi-colon delimited
// Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
// values because they come earlier in the order we check formats
public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {

View File

@ -73,20 +73,20 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
}
}
public void testMakeBestStructureGivenJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class));
public void testMakeBestStructureGivenNdJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
}
public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {
// Need to change the quote character from the default of double quotes
// otherwise the quotes in the JSON will stop it parsing as CSV
// otherwise the quotes in the NDJSON will stop it parsing as CSV
FileStructureOverrides overrides = FileStructureOverrides.builder()
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
}
public void testMakeBestStructureGivenXml() throws Exception {
@ -109,13 +109,13 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
public void testMakeBestStructureGivenCsvAndJsonOverride() {
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER));
assertEquals("Input did not match the specified format [json]", e.getMessage());
assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
}
public void testMakeBestStructureGivenText() throws Exception {

View File

@ -27,7 +27,7 @@ public abstract class FileStructureTestCase extends ESTestCase {
"2018-05-17T16:23:40,key1,42.0\n" +
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";
protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +

View File

@ -9,21 +9,21 @@ import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
import java.util.Collections;
public class JsonFileStructureFinderTests extends FileStructureTestCase {
public class NdJsonFileStructureFinderTests extends FileStructureTestCase {
private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
public void testCreateConfigsGivenGoodJson() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
FileStructure structure = structureFinder.getStructure();
assertEquals(FileStructure.Format.JSON, structure.getFormat());
assertEquals(FileStructure.Format.NDJSON, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());

View File

@ -5,13 +5,13 @@
*/
package org.elasticsearch.xpack.ml.filestructurefinder;
public class JsonFileStructureFinderFactoryTests extends FileStructureTestCase {
public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCase {
private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
public void testCanCreateFromSampleGivenJson() {
public void testCanCreateFromSampleGivenNdJson() {
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
}
public void testCanCreateFromSampleGivenXml() {

View File

@ -9,7 +9,7 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas
private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
// No need to check JSON, XML, CSV, TSV, semi-colon delimited values or pipe
// No need to check NDJSON, XML, CSV, TSV, semi-colon delimited values or pipe
// delimited values because they come earlier in the order we check formats
public void testCanCreateFromSampleGivenText() {

View File

@ -9,7 +9,7 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase {
private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();
// No need to check JSON because it comes earlier in the order we check formats
// No need to check NDJSON because it comes earlier in the order we check formats
public void testCanCreateFromSampleGivenXml() {

View File

@ -22,7 +22,7 @@
},
"format": {
"type": "enum",
"options": [ "json", "xml", "delimited", "semi_structured_text" ],
"options": [ "ndjson", "xml", "delimited", "semi_structured_text" ],
"description": "Optional parameter to specify the high level file format"
},
"has_header_row": {

View File

@ -1,5 +1,5 @@
---
"Test JSON file structure analysis without overrides":
"Test NDJSON file structure analysis without overrides":
- do:
headers:
# This is to stop the usual content type randomization, which
@ -26,7 +26,7 @@
- match: { num_messages_analyzed: 3 }
- match: { charset: "UTF-8" }
- match: { has_byte_order_marker: false }
- match: { format: json }
- match: { format: ndjson }
- match: { timestamp_field: time }
- match: { joda_timestamp_formats.0: UNIX }
- match: { java_timestamp_formats.0: UNIX }
@ -56,7 +56,7 @@
- is_false: explanation
---
"Test JSON file structure analysis with overrides":
"Test NDJSON file structure analysis with overrides":
- do:
headers:
# This is to stop the usual content type randomization, which
@ -64,7 +64,7 @@
Content-Type: "application/json"
xpack.ml.find_file_structure:
charset: UTF-8
format: json
format: ndjson
timestamp_field: time
timestamp_format: UNIX
explain: true
@ -86,7 +86,7 @@
- match: { num_messages_analyzed: 3 }
- match: { charset: "UTF-8" }
- match: { has_byte_order_marker: false }
- match: { format: json }
- match: { format: ndjson }
- match: { timestamp_field: time }
- match: { joda_timestamp_formats.0: UNIX }
- match: { java_timestamp_formats.0: UNIX }