Re-order and document format detection in web console (#9887)

The motivation for this change is to avoid inadvertently identifying binary
formats that contain uncompressed string data as TSV or CSV.

Moving detection of magic byte headers before heuristics should be more
robust in general.
This commit is contained in:
Joseph Glanville 2020-05-22 06:29:39 +07:00 committed by GitHub
parent 63baa29ad1
commit 132a1c9fe7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 19 additions and 13 deletions

View File

@ -2676,29 +2676,35 @@ function guessInputFormat(sampleData: string[]): InputFormat {
if (sampleDatum) {
sampleDatum = String(sampleDatum); // Really ensure it is a string
if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) {
return inputFormatFromType('json');
}
if (sampleDatum.split('\t').length > 3) {
return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum));
}
if (sampleDatum.split(',').length > 3) {
return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum));
}
// First check for magic byte sequences as they rarely yield false positives
// Parquet 4 byte magic header: https://github.com/apache/parquet-format#file-format
if (sampleDatum.startsWith('PAR1')) {
return inputFormatFromType('parquet');
}
// ORC 3 byte magic header: https://orc.apache.org/specification/ORCv1/
if (sampleDatum.startsWith('ORC')) {
return inputFormatFromType('orc');
}
// Avro OCF 4 byte magic header: https://avro.apache.org/docs/current/spec.html#Object+Container+Files
if (sampleDatum.startsWith('Obj1')) {
return inputFormatFromType('avro_ocf');
}
// After checking for magic byte sequences perform heuristics to deduce string formats
// If the string starts and ends with curly braces assume JSON
if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) {
return inputFormatFromType('json');
}
// Contains at least 3 tabs (i.e. splits into more than 3 fields): assume TSV
if (sampleDatum.split('\t').length > 3) {
return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum));
}
// Contains at least 3 commas (i.e. splits into more than 3 fields): assume CSV
if (sampleDatum.split(',').length > 3) {
return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum));
}
}
return inputFormatFromType('regex');