From 22415fa2de1d7d07cea7dd5e7263eb1ed4270503 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Wed, 29 Aug 2018 14:56:02 +0100 Subject: [PATCH] [ML] Fix character set finder bug with unencodable charsets (#33234) Some character sets cannot be encoded and this was tripping up the binary data check in the ML log structure character set finder. The fix is to assume that if ICU4J identifies that some bytes correspond to a character set that cannot be encoded and those bytes contain zeroes then the data is binary rather than text. Fixes #33227 --- .../LogStructureFinderManager.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java index 7f18445e505..a8fd9d7eb89 100644 --- a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java +++ b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java @@ -163,9 +163,15 @@ public final class LogStructureFinderManager { // deduction algorithms on binary files is very slow as the binary files generally appear to // have very long lines. boolean spaceEncodingContainsZeroByte = false; - byte[] spaceBytes = " ".getBytes(name); - for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) { - spaceEncodingContainsZeroByte = (spaceBytes[i] == 0); + Charset charset = Charset.forName(name); + // Some character sets cannot be encoded. These are extremely rare so it's likely that + // they've been chosen based on incorrectly provided binary data. Therefore, err on + // the side of rejecting binary data. + if (charset.canEncode()) { + byte[] spaceBytes = " ".getBytes(charset); + for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) { + spaceEncodingContainsZeroByte = (spaceBytes[i] == 0); + } } if (containsZeroBytes && spaceEncodingContainsZeroByte == false) { explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +