From 132a1c9fe770209792973f52b8aee6f36f06aa3d Mon Sep 17 00:00:00 2001 From: Joseph Glanville Date: Fri, 22 May 2020 06:29:39 +0700 Subject: [PATCH] Re-order and document format detection in web console (#9887) Motivation for this change is to not inadvertently identify binary formats that contain uncompressed string data as TSV or CSV. Moving detection of magic byte headers before heuristics should be more robust in general. --- web-console/src/utils/ingestion-spec.tsx | 32 ++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/web-console/src/utils/ingestion-spec.tsx b/web-console/src/utils/ingestion-spec.tsx index 0bc3ff3739d..9a2d70058c2 100644 --- a/web-console/src/utils/ingestion-spec.tsx +++ b/web-console/src/utils/ingestion-spec.tsx @@ -2676,29 +2676,35 @@ function guessInputFormat(sampleData: string[]): InputFormat { if (sampleDatum) { sampleDatum = String(sampleDatum); // Really ensure it is a string - if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) { - return inputFormatFromType('json'); - } - - if (sampleDatum.split('\t').length > 3) { - return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum)); - } - - if (sampleDatum.split(',').length > 3) { - return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum)); - } + // First check for magic byte sequences as they rarely yield false positives + // Parquet 4 byte magic header: https://github.com/apache/parquet-format#file-format if (sampleDatum.startsWith('PAR1')) { return inputFormatFromType('parquet'); } - + // ORC 3 byte magic header: https://orc.apache.org/specification/ORCv1/ if (sampleDatum.startsWith('ORC')) { return inputFormatFromType('orc'); } - + // Avro OCF 4 byte magic header: https://avro.apache.org/docs/current/spec.html#Object+Container+Files if (sampleDatum.startsWith('Obj1')) { return inputFormatFromType('avro_ocf'); } + + // After checking for magic byte sequences perform heuristics to deduce string formats + + // If the string starts 
and ends with curly braces, assume JSON + if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) { + return inputFormatFromType('json'); + } + // Contains at least 3 tabs (i.e. splits into more than 3 fields), assume TSV + if (sampleDatum.split('\t').length > 3) { + return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum)); + } + // Contains at least 3 commas (i.e. splits into more than 3 fields), assume CSV + if (sampleDatum.split(',').length > 3) { + return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum)); + } } return inputFormatFromType('regex');