Restart running monitors if no heartbeat (#3952)
This commit is contained in:
parent
9f170a68d7
commit
c43223a16d
|
@ -3,7 +3,7 @@ const dayjs = require("dayjs");
|
||||||
const axios = require("axios");
|
const axios = require("axios");
|
||||||
const { Prometheus } = require("../prometheus");
|
const { Prometheus } = require("../prometheus");
|
||||||
const { log, UP, DOWN, PENDING, MAINTENANCE, flipStatus, TimeLogger, MAX_INTERVAL_SECOND, MIN_INTERVAL_SECOND,
|
const { log, UP, DOWN, PENDING, MAINTENANCE, flipStatus, TimeLogger, MAX_INTERVAL_SECOND, MIN_INTERVAL_SECOND,
|
||||||
SQL_DATETIME_FORMAT
|
SQL_DATETIME_FORMAT, isDev, sleep, getRandomInt
|
||||||
} = require("../../src/util");
|
} = require("../../src/util");
|
||||||
const { tcping, ping, dnsResolve, checkCertificate, checkStatusCode, getTotalClientInRoom, setting, mssqlQuery, postgresQuery, mysqlQuery, mqttAsync, setSetting, httpNtlm, radius, grpcQuery,
|
const { tcping, ping, dnsResolve, checkCertificate, checkStatusCode, getTotalClientInRoom, setting, mssqlQuery, postgresQuery, mysqlQuery, mqttAsync, setSetting, httpNtlm, radius, grpcQuery,
|
||||||
redisPingAsync, mongodbPing, kafkaProducerAsync, getOidcTokenClientCredentials, rootCertificatesFingerprints
|
redisPingAsync, mongodbPing, kafkaProducerAsync, getOidcTokenClientCredentials, rootCertificatesFingerprints
|
||||||
|
@ -328,6 +328,16 @@ class Monitor extends BeanModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Evil
|
||||||
|
if (isDev) {
|
||||||
|
if (process.env.EVIL_RANDOM_MONITOR_SLEEP === "SURE") {
|
||||||
|
if (getRandomInt(0, 100) === 0) {
|
||||||
|
log.debug("evil", `[${this.name}] Evil mode: Random sleep: ` + beatInterval * 10000);
|
||||||
|
await sleep(beatInterval * 10000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Expose here for prometheus update
|
// Expose here for prometheus update
|
||||||
// undefined if not https
|
// undefined if not https
|
||||||
let tlsInfo = undefined;
|
let tlsInfo = undefined;
|
||||||
|
@ -995,6 +1005,7 @@ class Monitor extends BeanModel {
|
||||||
if (! this.isStop) {
|
if (! this.isStop) {
|
||||||
log.debug("monitor", `[${this.name}] SetTimeout for next check.`);
|
log.debug("monitor", `[${this.name}] SetTimeout for next check.`);
|
||||||
this.heartbeatInterval = setTimeout(safeBeat, beatInterval * 1000);
|
this.heartbeatInterval = setTimeout(safeBeat, beatInterval * 1000);
|
||||||
|
this.lastScheduleBeatTime = dayjs();
|
||||||
} else {
|
} else {
|
||||||
log.info("monitor", `[${this.name}] isStop = true, no next check.`);
|
log.info("monitor", `[${this.name}] isStop = true, no next check.`);
|
||||||
}
|
}
|
||||||
|
@ -1004,7 +1015,9 @@ class Monitor extends BeanModel {
|
||||||
/** Get a heartbeat and handle errors */
|
/** Get a heartbeat and handle errors */
|
||||||
const safeBeat = async () => {
|
const safeBeat = async () => {
|
||||||
try {
|
try {
|
||||||
|
this.lastStartBeatTime = dayjs();
|
||||||
await beat();
|
await beat();
|
||||||
|
this.lastEndBeatTime = dayjs();
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.trace(e);
|
console.trace(e);
|
||||||
UptimeKumaServer.errorLog(e, false);
|
UptimeKumaServer.errorLog(e, false);
|
||||||
|
@ -1013,6 +1026,9 @@ class Monitor extends BeanModel {
|
||||||
if (! this.isStop) {
|
if (! this.isStop) {
|
||||||
log.info("monitor", "Try to restart the monitor");
|
log.info("monitor", "Try to restart the monitor");
|
||||||
this.heartbeatInterval = setTimeout(safeBeat, this.interval * 1000);
|
this.heartbeatInterval = setTimeout(safeBeat, this.interval * 1000);
|
||||||
|
this.lastScheduleBeatTime = dayjs();
|
||||||
|
} else {
|
||||||
|
log.info("monitor", "isStop = true, no next check.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -12,6 +12,7 @@ const { Settings } = require("./settings");
|
||||||
const dayjs = require("dayjs");
|
const dayjs = require("dayjs");
|
||||||
const childProcess = require("child_process");
|
const childProcess = require("child_process");
|
||||||
const path = require("path");
|
const path = require("path");
|
||||||
|
const axios = require("axios");
|
||||||
// DO NOT IMPORT HERE IF THE MODULES USED `UptimeKumaServer.getInstance()`, put at the bottom of this file instead.
|
// DO NOT IMPORT HERE IF THE MODULES USED `UptimeKumaServer.getInstance()`, put at the bottom of this file instead.
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -62,6 +63,8 @@ class UptimeKumaServer {
|
||||||
*/
|
*/
|
||||||
jwtSecret = null;
|
jwtSecret = null;
|
||||||
|
|
||||||
|
checkMonitorsInterval = null;
|
||||||
|
|
||||||
static getInstance(args) {
|
static getInstance(args) {
|
||||||
if (UptimeKumaServer.instance == null) {
|
if (UptimeKumaServer.instance == null) {
|
||||||
UptimeKumaServer.instance = new UptimeKumaServer(args);
|
UptimeKumaServer.instance = new UptimeKumaServer(args);
|
||||||
|
@ -75,6 +78,9 @@ class UptimeKumaServer {
|
||||||
const sslCert = args["ssl-cert"] || process.env.UPTIME_KUMA_SSL_CERT || process.env.SSL_CERT || undefined;
|
const sslCert = args["ssl-cert"] || process.env.UPTIME_KUMA_SSL_CERT || process.env.SSL_CERT || undefined;
|
||||||
const sslKeyPassphrase = args["ssl-key-passphrase"] || process.env.UPTIME_KUMA_SSL_KEY_PASSPHRASE || process.env.SSL_KEY_PASSPHRASE || undefined;
|
const sslKeyPassphrase = args["ssl-key-passphrase"] || process.env.UPTIME_KUMA_SSL_KEY_PASSPHRASE || process.env.SSL_KEY_PASSPHRASE || undefined;
|
||||||
|
|
||||||
|
// Set default axios timeout to 5 minutes instead of infinity
|
||||||
|
axios.defaults.timeout = 300 * 1000;
|
||||||
|
|
||||||
log.info("server", "Creating express and socket.io instance");
|
log.info("server", "Creating express and socket.io instance");
|
||||||
this.app = express();
|
this.app = express();
|
||||||
if (sslKey && sslCert) {
|
if (sslKey && sslCert) {
|
||||||
|
@ -346,6 +352,10 @@ class UptimeKumaServer {
|
||||||
if (enable || enable === null) {
|
if (enable || enable === null) {
|
||||||
this.startNSCDServices();
|
this.startNSCDServices();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.checkMonitorsInterval = setInterval(() => {
|
||||||
|
this.checkMonitors();
|
||||||
|
}, 60 * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -358,6 +368,8 @@ class UptimeKumaServer {
|
||||||
if (enable || enable === null) {
|
if (enable || enable === null) {
|
||||||
this.stopNSCDServices();
|
this.stopNSCDServices();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
clearInterval(this.checkMonitorsInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -388,6 +400,83 @@ class UptimeKumaServer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Start the specified monitor
|
||||||
|
* @param {number} monitorID ID of monitor to start
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
|
async startMonitor(monitorID) {
|
||||||
|
log.info("manage", `Resume Monitor: ${monitorID} by server`);
|
||||||
|
|
||||||
|
await R.exec("UPDATE monitor SET active = 1 WHERE id = ?", [
|
||||||
|
monitorID,
|
||||||
|
]);
|
||||||
|
|
||||||
|
let monitor = await R.findOne("monitor", " id = ? ", [
|
||||||
|
monitorID,
|
||||||
|
]);
|
||||||
|
|
||||||
|
if (monitor.id in this.monitorList) {
|
||||||
|
this.monitorList[monitor.id].stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
this.monitorList[monitor.id] = monitor;
|
||||||
|
monitor.start(this.io);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Restart a given monitor
|
||||||
|
* @param {number} monitorID ID of monitor to start
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
|
async restartMonitor(monitorID) {
|
||||||
|
return await this.startMonitor(monitorID);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if monitors are running properly
|
||||||
|
*/
|
||||||
|
async checkMonitors() {
|
||||||
|
log.debug("monitor_checker", "Checking monitors");
|
||||||
|
|
||||||
|
for (let monitorID in this.monitorList) {
|
||||||
|
let monitor = this.monitorList[monitorID];
|
||||||
|
|
||||||
|
// Not for push monitor
|
||||||
|
if (monitor.type === "push") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!monitor.active) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the lastStartBeatTime, if it is too long, then restart
|
||||||
|
if (monitor.lastScheduleBeatTime ) {
|
||||||
|
let diff = dayjs().diff(monitor.lastStartBeatTime, "second");
|
||||||
|
|
||||||
|
if (diff > monitor.interval * 1.5) {
|
||||||
|
log.error("monitor_checker", `Monitor Interval: ${monitor.interval} Monitor ` + monitorID + " lastStartBeatTime diff: " + diff);
|
||||||
|
log.error("monitor_checker", "Unexpected error: Monitor " + monitorID + " is struck for unknown reason");
|
||||||
|
log.error("monitor_checker", "Last start beat time: " + R.isoDateTime(monitor.lastStartBeatTime));
|
||||||
|
log.error("monitor_checker", "Last end beat time: " + R.isoDateTime(monitor.lastEndBeatTime));
|
||||||
|
log.error("monitor_checker", "Last ScheduleBeatTime: " + R.isoDateTime(monitor.lastScheduleBeatTime));
|
||||||
|
|
||||||
|
// Restart
|
||||||
|
log.error("monitor_checker", `Restarting monitor ${monitorID} automatically now`);
|
||||||
|
this.restartMonitor(monitorID);
|
||||||
|
} else {
|
||||||
|
//log.debug("monitor_checker", "Monitor " + monitorID + " is running normally");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//log.debug("monitor_checker", "Monitor " + monitorID + " is not started yet, skipp");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("monitor_checker", "Checking monitors end");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|
Loading…
Reference in New Issue