[ML] add version information in case of crash of native ML process (#30674)

This change adds version information in case a native ML process crashes. The version is important for choosing the right symbol files when analyzing the crash, and adding it puts all the necessary information on one line.

relates elastic/ml-cpp#94
This commit is contained in:
Hendrik Muhs 2018-05-18 07:46:52 +02:00 committed by GitHub
parent 443c7014ba
commit d893041634
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 18 deletions

View File

@ -22,8 +22,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@ -84,20 +82,7 @@ public class NativeController {
}
/**
 * Returns version information about the native code.
 *
 * <p>The work is delegated to the C++ log handler, which extracts the version and
 * build hash from the copyright message. Keeping the parsing in one place avoids a
 * second copy of the pattern that would have to be kept in sync with the C++ side.
 *
 * @return the map produced by the log handler, holding the {@code version} and
 *         {@code build_hash} entries
 * @throws TimeoutException propagated from the log handler, which is given
 *         {@code CONTROLLER_CONNECT_TIMEOUT} as its timeout
 */
public Map<String, Object> getNativeCodeInfo() throws TimeoutException {
    return cppLogHandler.getNativeCodeInfo(CONTROLLER_CONNECT_TIMEOUT);
}
public void startProcess(List<String> command) throws IOException {

View File

@ -8,7 +8,7 @@ package org.elasticsearch.xpack.ml.job.process.logging;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
@ -30,10 +30,15 @@ import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.Deque;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Handle a stream of C++ log messages that arrive via a named pipe in JSON format.
@ -181,6 +186,26 @@ public class CppLogMessageHandler implements Closeable {
return cppCopyright;
}
/**
 * Matches the version and build hash inside the copyright message of the native code.
 * Compiled once, since the expression is constant and a compiled {@link Pattern} can be reused.
 */
private static final Pattern NATIVE_CODE_INFO_PATTERN = Pattern.compile("Version (.+) \\(Build ([^)]+)\\) Copyright ");

/**
 * Extracts version information from the copyright string, which is assumed to follow a certain format.
 *
 * @param timeout handed to {@code getCppCopyright} when fetching the copyright message
 * @return a new map holding the {@code version} and {@code build_hash} entries
 * @throws TimeoutException propagated from {@code getCppCopyright}
 * @throws ElasticsearchException if the copyright message does not match the expected format
 */
public Map<String, Object> getNativeCodeInfo(Duration timeout) throws TimeoutException {
    String copyrightMessage = getCppCopyright(timeout);
    Matcher matcher = NATIVE_CODE_INFO_PATTERN.matcher(copyrightMessage);
    if (matcher.find() == false) {
        // If this happens it probably means someone has changed the format in lib/ver/CBuildInfo.cc
        // in the ml-cpp repo without changing the pattern above to match
        String msg = "Unexpected native process copyright format: " + copyrightMessage;
        LOGGER.error(msg);
        throw new ElasticsearchException(msg);
    }

    Map<String, Object> info = new HashMap<>(2);
    info.put("version", matcher.group(1));
    info.put("build_hash", matcher.group(2));
    return info;
}
/**
* Expected to be called very infrequently.
*/
@ -281,8 +306,18 @@ public class CppLogMessageHandler implements Closeable {
} catch (XContentParseException e) {
String upstreamMessage = "Fatal error: '" + bytesRef.utf8ToString() + "'";
if (upstreamMessage.contains("bad_alloc")) {
upstreamMessage += ", process ran out of memory.";
upstreamMessage += ", process ran out of memory";
}
// add version information, so it's conveniently next to the crash log
upstreamMessage += ", version: ";
try {
Map<String, Object> versionInfo = getNativeCodeInfo(Duration.ofMillis(10));
upstreamMessage += String.format(Locale.ROOT, "%s (build %s)", versionInfo.get("version"), versionInfo.get("build_hash"));
} catch (TimeoutException timeoutException) {
upstreamMessage += "failed to retrieve";
}
storeError(upstreamMessage);
seenFatalError = true;
} catch (IOException e) {