mirror of https://github.com/apache/druid.git
Re-order and document format detection in web console (#9887)
The motivation for this change is to avoid inadvertently identifying binary formats that contain uncompressed string data as TSV or CSV. Checking for magic byte headers before applying the string heuristics should make format detection more robust in general.
This commit is contained in:
parent
63baa29ad1
commit
132a1c9fe7
|
@ -2676,29 +2676,35 @@ function guessInputFormat(sampleData: string[]): InputFormat {
|
||||||
if (sampleDatum) {
|
if (sampleDatum) {
|
||||||
sampleDatum = String(sampleDatum); // Really ensure it is a string
|
sampleDatum = String(sampleDatum); // Really ensure it is a string
|
||||||
|
|
||||||
if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) {
|
// First check for magic byte sequences as they rarely yield false positives
|
||||||
return inputFormatFromType('json');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sampleDatum.split('\t').length > 3) {
|
|
||||||
return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sampleDatum.split(',').length > 3) {
|
|
||||||
return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Parquet 4 byte magic header: https://github.com/apache/parquet-format#file-format
|
||||||
if (sampleDatum.startsWith('PAR1')) {
|
if (sampleDatum.startsWith('PAR1')) {
|
||||||
return inputFormatFromType('parquet');
|
return inputFormatFromType('parquet');
|
||||||
}
|
}
|
||||||
|
// ORC 3 byte magic header: https://orc.apache.org/specification/ORCv1/
|
||||||
if (sampleDatum.startsWith('ORC')) {
|
if (sampleDatum.startsWith('ORC')) {
|
||||||
return inputFormatFromType('orc');
|
return inputFormatFromType('orc');
|
||||||
}
|
}
|
||||||
|
// Avro OCF 4 byte magic header: https://avro.apache.org/docs/current/spec.html#Object+Container+Files
|
||||||
if (sampleDatum.startsWith('Obj1')) {
|
if (sampleDatum.startsWith('Obj1')) {
|
||||||
return inputFormatFromType('avro_ocf');
|
return inputFormatFromType('avro_ocf');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// After checking for magic byte sequences perform heuristics to deduce string formats
|
||||||
|
|
||||||
|
// If the string starts and ends with curly braces assume JSON
|
||||||
|
if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) {
|
||||||
|
return inputFormatFromType('json');
|
||||||
|
}
|
||||||
|
// Contains at least 3 tabs (i.e. splits into more than 3 fields): assume TSV
|
||||||
|
if (sampleDatum.split('\t').length > 3) {
|
||||||
|
return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum));
|
||||||
|
}
|
||||||
|
// Contains at least 3 commas (i.e. splits into more than 3 fields): assume CSV
|
||||||
|
if (sampleDatum.split(',').length > 3) {
|
||||||
|
return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return inputFormatFromType('regex');
|
return inputFormatFromType('regex');
|
||||||
|
|
Loading…
Reference in New Issue