[ML] add version information in case of crash of native ML process (#30674)
This change adds version information to the error reported when a native ML process crashes. The version is important for choosing the right symbol files when analyzing the crash, and adding it puts all the necessary information on one line. Relates to elastic/ml-cpp#94.
This commit is contained in:
parent
443c7014ba
commit
d893041634
|
@ -22,8 +22,6 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -84,20 +82,7 @@ public class NativeController {
|
|||
}
|
||||
|
||||
public Map<String, Object> getNativeCodeInfo() throws TimeoutException {
|
||||
String copyrightMessage = cppLogHandler.getCppCopyright(CONTROLLER_CONNECT_TIMEOUT);
|
||||
Matcher matcher = Pattern.compile("Version (.+) \\(Build ([^)]+)\\) Copyright ").matcher(copyrightMessage);
|
||||
if (matcher.find()) {
|
||||
Map<String, Object> info = new HashMap<>(2);
|
||||
info.put("version", matcher.group(1));
|
||||
info.put("build_hash", matcher.group(2));
|
||||
return info;
|
||||
} else {
|
||||
// If this happens it probably means someone has changed the format in lib/ver/CBuildInfo.cc
|
||||
// in the machine-learning-cpp repo without changing the pattern above to match
|
||||
String msg = "Unexpected native controller process copyright format: " + copyrightMessage;
|
||||
LOGGER.error(msg);
|
||||
throw new ElasticsearchException(msg);
|
||||
}
|
||||
return cppLogHandler.getNativeCodeInfo(CONTROLLER_CONNECT_TIMEOUT);
|
||||
}
|
||||
|
||||
public void startProcess(List<String> command) throws IOException {
|
||||
|
|
|
@ -8,7 +8,7 @@ package org.elasticsearch.xpack.ml.job.process.logging;
|
|||
import org.apache.logging.log4j.Level;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.apache.logging.log4j.message.ParameterizedMessage;
|
||||
import org.elasticsearch.common.ParsingException;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.bytes.BytesArray;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
|
@ -30,10 +30,15 @@ import java.time.Duration;
|
|||
import java.time.Instant;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Handle a stream of C++ log messages that arrive via a named pipe in JSON format.
|
||||
|
@ -181,6 +186,26 @@ public class CppLogMessageHandler implements Closeable {
|
|||
return cppCopyright;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts version information from the copyright string which assumes a certain format.
|
||||
*/
|
||||
public Map<String, Object> getNativeCodeInfo(Duration timeout) throws TimeoutException {
|
||||
String copyrightMessage = getCppCopyright(timeout);
|
||||
Matcher matcher = Pattern.compile("Version (.+) \\(Build ([^)]+)\\) Copyright ").matcher(copyrightMessage);
|
||||
if (matcher.find()) {
|
||||
Map<String, Object> info = new HashMap<>(2);
|
||||
info.put("version", matcher.group(1));
|
||||
info.put("build_hash", matcher.group(2));
|
||||
return info;
|
||||
} else {
|
||||
// If this happens it probably means someone has changed the format in lib/ver/CBuildInfo.cc
|
||||
// in the ml-cpp repo without changing the pattern above to match
|
||||
String msg = "Unexpected native process copyright format: " + copyrightMessage;
|
||||
LOGGER.error(msg);
|
||||
throw new ElasticsearchException(msg);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Expected to be called very infrequently.
|
||||
*/
|
||||
|
@ -281,8 +306,18 @@ public class CppLogMessageHandler implements Closeable {
|
|||
} catch (XContentParseException e) {
|
||||
String upstreamMessage = "Fatal error: '" + bytesRef.utf8ToString() + "'";
|
||||
if (upstreamMessage.contains("bad_alloc")) {
|
||||
upstreamMessage += ", process ran out of memory.";
|
||||
upstreamMessage += ", process ran out of memory";
|
||||
}
|
||||
|
||||
// add version information, so it's conveniently next to the crash log
|
||||
upstreamMessage += ", version: ";
|
||||
try {
|
||||
Map<String, Object> versionInfo = getNativeCodeInfo(Duration.ofMillis(10));
|
||||
upstreamMessage += String.format(Locale.ROOT, "%s (build %s)", versionInfo.get("version"), versionInfo.get("build_hash"));
|
||||
} catch (TimeoutException timeoutException) {
|
||||
upstreamMessage += "failed to retrieve";
|
||||
}
|
||||
|
||||
storeError(upstreamMessage);
|
||||
seenFatalError = true;
|
||||
} catch (IOException e) {
|
||||
|
|
Loading…
Reference in New Issue