public inbox for gentoo-commits@lists.gentoo.org
* [gentoo-commits] proj/openrc:master commit in: /, src/rc/, man/, sh/
@ 2018-10-23 22:00 William Hubbs
From: William Hubbs @ 2018-10-23 22:00 UTC
  To: gentoo-commits

commit:     c1e582586d398b4452f568240985247294f645ef
Author:     William Hubbs <w.d.hubbs <AT> gmail <DOT> com>
AuthorDate: Tue Oct  9 22:49:02 2018 +0000
Commit:     William Hubbs <williamh <AT> gentoo <DOT> org>
CommitDate: Tue Oct 23 18:38:14 2018 +0000
URL:        https://gitweb.gentoo.org/proj/openrc.git/commit/?id=c1e58258

supervise-daemon: add health checks

Health checks are a way to periodically monitor a service and make sure
it stays healthy.

If a health check fails, the service is automatically restarted after
its unhealthy() function is run to perform any cleanup.
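
As an illustration only (not part of the patch below), a service script
using the new hooks might look like the following minimal sketch; the
daemon, its --nodaemon flag, and the mydaemon-ctl health probe are
hypothetical:

``` sh
#!/sbin/openrc-run
# Hypothetical service script sketch showing the new health check hooks.
supervisor=supervise-daemon
command=/usr/sbin/mydaemon
command_args_foreground="--nodaemon"
healthcheck_delay=60   # wait 60 seconds before the first check
healthcheck_timer=30   # then run healthcheck() every 30 seconds

healthcheck() {
	# Return 0 if the daemon is healthy, non-zero otherwise.
	mydaemon-ctl ping >/dev/null 2>&1
}

unhealthy() {
	# Clean up before the supervisor restarts the daemon.
	rm -f /run/mydaemon/stale.lock
	return 0
}
```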

 NEWS.md                   |   4 ++
 man/supervise-daemon.8    |   9 +++
 sh/supervise-daemon.sh    |  14 +++++
 src/rc/Makefile           |   2 +-
 src/rc/supervise-daemon.c | 136 +++++++++++++++++++++++++++++++++++-----------
 supervise-daemon-guide.md |  36 ++++++++++++
 6 files changed, 169 insertions(+), 32 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index d4d96577..f1400197 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to
 openrc-shutdown. Shutdowns can now be delayed for a certain amount of
 time or scheduled for an exact time.
 
+supervise-daemon now supports health checks, a way to periodically make sure a
+service stays healthy. For more information on setting this up, please see
+supervise-daemon-guide.md.
+
 ## OpenRC 0.37
 
 start-stop-daemon now supports logging stdout and stderr of daemons to

diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8
index af06ee31..8bcd8b5c 100644
--- a/man/supervise-daemon.8
+++ b/man/supervise-daemon.8
@@ -16,6 +16,10 @@
 .Nd starts a daemon and restarts it if it crashes
 .Sh SYNOPSIS
 .Nm
+.Fl a , -healthcheck-timer
+.Ar seconds
+.Fl A , -healthcheck-delay
+.Ar seconds
 .Fl D , -respawn-delay
 .Ar seconds
 .Fl d , -chdir
@@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.
 .Pp
 The options are as follows:
 .Bl -tag -width indent
+.It Fl a , -healthcheck-timer Ar seconds
+Run the healthcheck() command every time this number of seconds passes.
+If it fails, run the unhealthy() command and restart the daemon.
+.It Fl A , -healthcheck-delay Ar seconds
+Wait this number of seconds before the first health check.
 .It Fl D , -respawn-delay Ar seconds
 wait this number of seconds before restarting a daemon after it crashes.
 The default is 0.

diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh
index 80e0260c..73a70140 100644
--- a/sh/supervise-daemon.sh
+++ b/sh/supervise-daemon.sh
@@ -10,6 +10,8 @@
 # This file may not be copied, modified, propagated, or distributed
 #    except according to the terms contained in the LICENSE file.
 
+extra_commands="healthcheck unhealthy ${extra_commands}"
+
 supervise_start()
 {
 	if [ -z "$command" ]; then
@@ -32,6 +34,8 @@ supervise_start()
 		${respawn_delay:+--respawn-delay} $respawn_delay \
 		${respawn_max:+--respawn-max} $respawn_max \
 		${respawn_period:+--respawn-period} $respawn_period \
+		${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \
+		${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \
 		${command_user+--user} $command_user \
 		${umask+--umask} $umask \
 		${supervise_daemon_args:-${start_stop_daemon_args}} \
@@ -98,3 +102,13 @@ supervise_status()
 		return 3
 	fi
 }
+
+healthcheck()
+{
+	return 0
+}
+
+unhealthy()
+{
+	return 0
+}

diff --git a/src/rc/Makefile b/src/rc/Makefile
index 9ba240fa..ea4a8c81 100644
--- a/src/rc/Makefile
+++ b/src/rc/Makefile
@@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o
 start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
 	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
 
-supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o
+supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o
 	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
 
 service_get_value service_set_value get_options save_options: do_value.o rc-misc.o

diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c
index 27089152..883c738d 100644
--- a/src/rc/supervise-daemon.c
+++ b/src/rc/supervise-daemon.c
@@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};
 #include "queue.h"
 #include "rc.h"
 #include "rc-misc.h"
+#include "rc-plugin.h"
 #include "rc-schedules.h"
 #include "_usage.h"
 #include "helpers.h"
 
 const char *applet = NULL;
 const char *extraopts = NULL;
-const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \
+const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \
 	getoptstring_COMMON;
 const struct option longopts[] = {
+	{ "healthcheck-timer",        1, NULL, 'a'},
+	{ "healthcheck-delay",        1, NULL, 'A'},
 	{ "respawn-delay",        1, NULL, 'D'},
 	{ "chdir",        1, NULL, 'd'},
 	{ "env",          1, NULL, 'e'},
@@ -91,6 +94,8 @@ const struct option longopts[] = {
 	longopts_COMMON
 };
 const char * const longopts_help[] = {
+	"Set a health check timer",
+	"Set an initial health check delay",
 	"Set a respawn delay",
 	"Change the PWD",
 	"Set an environment string",
@@ -113,6 +118,9 @@ const char * const longopts_help[] = {
 };
 const char *usagestring = NULL;
 
+static int healthcheckdelay = 0;
+static int healthchecktimer = 0;
+static volatile sig_atomic_t do_healthcheck = 0;
 static int nicelevel = 0;
 static int ionicec = -1;
 static int ioniced = 0;
@@ -183,6 +191,12 @@ static void handle_signal(int sig)
 		re_exec_supervisor();
 }
 
+static void healthcheck(int sig)
+{
+	if (sig == SIGALRM)
+		do_healthcheck = 1;
+}
+
 static char * expand_home(const char *home, const char *path)
 {
 	char *opath, *ppath, *p, *nh;
@@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)
 static void supervisor(char *exec, char **argv)
 {
 	FILE *fp;
+	pid_t wait_pid;
 	int i;
 	int nkilled;
 	struct timespec ts;
 	time_t respawn_now= 0;
 	time_t first_spawn= 0;
+	pid_t health_pid;
+	int health_status;
 
 #ifndef RC_DEBUG
 	signal_setup_restart(SIGHUP, handle_signal);
@@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)
 	 * Supervisor main loop
 	 */
 	i = 0;
+	if (healthcheckdelay) {
+		signal_setup(SIGALRM, healthcheck);
+		alarm(healthcheckdelay);
+	} else if (healthchecktimer) {
+		signal_setup(SIGALRM, healthcheck);
+		alarm(healthchecktimer);
+	}
 	while (!exiting) {
-		wait(&i);
-		if (exiting) {
-			signal_setup(SIGCHLD, SIG_IGN);
-			syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
-			nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
-					false, false, true);
-			if (nkilled > 0)
-				syslog(LOG_INFO, "killed %d processes", nkilled);
-		} else {
-			ts.tv_sec = respawn_delay;
-			ts.tv_nsec = 0;
-			nanosleep(&ts, NULL);
-			if (respawn_max > 0 && respawn_period > 0) {
-				respawn_now = time(NULL);
-				if (first_spawn == 0)
-					first_spawn = respawn_now;
-				if (respawn_now - first_spawn > respawn_period) {
-					respawn_count = 0;
-					first_spawn = 0;
-				} else
-					respawn_count++;
-				if (respawn_count > respawn_max) {
-					syslog(LOG_WARNING,
-							"respawned \"%s\" too many times, exiting", exec);
-					exiting = true;
+		wait_pid = wait(&i);
+		if (wait_pid == -1) {
+			if (do_healthcheck) {
+				do_healthcheck = 0;
+				alarm(0);
+				syslog(LOG_DEBUG, "running health check for %s", svcname);
+				health_pid = exec_service(svcname, "healthcheck");
+				health_status = rc_waitpid(health_pid);
+				if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {
+					alarm(healthchecktimer);
 					continue;
+				} else {
+					syslog(LOG_WARNING, "health check for %s failed", svcname);
+					health_pid = exec_service(svcname, "unhealthy");
+					rc_waitpid(health_pid);
+					syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
+					nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,
+							false, false, true);
+					if (nkilled > 0)
+						syslog(LOG_INFO, "killed %d processes", nkilled);
+					else if (errno != 0)
+						syslog(LOG_INFO, "Unable to kill %d: %s",
+								child_pid, strerror(errno));
 				}
+			} else if (exiting) {
+				alarm(0);
+				syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
+				nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
+						false, false, true);
+				if (nkilled > 0)
+					syslog(LOG_INFO, "killed %d processes", nkilled);
+				continue;
 			}
+		} else if (wait_pid == child_pid) {
 			if (WIFEXITED(i))
 				syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
 						exec, child_pid, WEXITSTATUS(i));
 			else if (WIFSIGNALED(i))
 				syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
 						exec, child_pid, WTERMSIG(i));
-			child_pid = fork();
-			if (child_pid == -1)
-				eerrorx("%s: fork: %s", applet, strerror(errno));
-			if (child_pid == 0)
-				child_process(exec, argv);
+		} else
+			continue;
+
+		ts.tv_sec = respawn_delay;
+		ts.tv_nsec = 0;
+		nanosleep(&ts, NULL);
+		if (respawn_max > 0 && respawn_period > 0) {
+			respawn_now = time(NULL);
+			if (first_spawn == 0)
+				first_spawn = respawn_now;
+			if (respawn_now - first_spawn > respawn_period) {
+				respawn_count = 0;
+				first_spawn = 0;
+			} else
+				respawn_count++;
+			if (respawn_count > respawn_max) {
+				syslog(LOG_WARNING,
+						"respawned \"%s\" too many times, exiting", exec);
+				exiting = true;
+				continue;
+			}
+		}
+		alarm(0);
+		child_pid = fork();
+		if (child_pid == -1)
+			eerrorx("%s: fork: %s", applet, strerror(errno));
+		if (child_pid == 0)
+			child_process(exec, argv);
+		if (healthcheckdelay) {
+			signal_setup(SIGALRM, healthcheck);
+			alarm(healthcheckdelay);
+		} else if (healthchecktimer) {
+			signal_setup(SIGALRM, healthcheck);
+			alarm(healthchecktimer);
 		}
 	}
 
@@ -612,6 +671,16 @@ int main(int argc, char **argv)
 	while ((opt = getopt_long(argc, argv, getoptstring, longopts,
 		    (int *) 0)) != -1)
 		switch (opt) {
+		case 'a':  /* --healthcheck-timer <time> */
+			if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
+				eerrorx("%s: invalid health check timer %s", applet, optarg);
+			break;
+
+		case 'A':  /* --healthcheck-delay <time> */
+			if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)
+				eerrorx("%s: invalid health check delay %s", applet, optarg);
+			break;
+
 		case 'D':  /* --respawn-delay time */
 			n = sscanf(optarg, "%d", &respawn_delay);
 			if (n	!= 1 || respawn_delay < 1)
@@ -668,6 +737,11 @@ int main(int argc, char **argv)
 			gid = gr->gr_gid;
 			break;
 
+		case 'H':  /* --healthcheck-timer <minutes> */
+			if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
+				eerrorx("%s: invalid health check timer %s", applet, optarg);
+			break;
+
 		case 'k':
 			if (parse_mode(&numask, optarg))
 				eerrorx("%s: invalid mode `%s'",

diff --git a/supervise-daemon-guide.md b/supervise-daemon-guide.md
index 0b15a858..07ab55cf 100644
--- a/supervise-daemon-guide.md
+++ b/supervise-daemon-guide.md
@@ -22,6 +22,28 @@ The following is a brief guide on using this capability.
   instructs it not to fork to the command_args_foreground variable shown
   below.
 
+# Health Checks
+
+Health checks are a way to make sure a service monitored by
+supervise-daemon stays healthy. To configure a health check for a
+service, you need to write a healthcheck() function, and optionally an
+unhealthy() function in the service script. Also, you will need to set
+the healthcheck_timer and optionally healthcheck_delay variables.
+
+## healthcheck() function
+
+The healthcheck() function is run repeatedly based on the settings of
+the healthcheck_* variables. This function should return zero if the
+service is currently healthy or non-zero otherwise.
+
+## unhealthy() function
+
+If the healthcheck() function returns non-zero, the unhealthy() function
+is run and then the service is restarted. Since the supervisor restarts
+the service itself, the unhealthy() function should not try to restart
+it; its purpose is to perform any cleanup tasks other than restarting
+the service.
+
 # Variable Settings
 
 The most important setting is the supervisor variable. At the top of
@@ -52,6 +74,20 @@ This 	should be used if the daemon you want to monitor
 forks and goes to the background by default. This should be set to the
 command line option that instructs the daemon to stay in the foreground.
 
+``` sh
+healthcheck_delay=seconds
+```
+
+This is the delay, in seconds, before the first health check is run.
+If it is not set, we use the value of healthcheck_timer.
+
+``` sh
+healthcheck_timer=seconds
+```
+
+This is the number of seconds between health checks. If it is not set,
+no health checks will be run.
+
 ``` sh
 respawn_delay
 ```
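
For reference, the new options can also be passed directly on the
supervise-daemon command line. The sketch below mirrors the way
sh/supervise-daemon.sh assembles its invocation; the service name,
daemon path, and --nodaemon flag are placeholders, and the exact
argument layout is an assumption based on that wrapper:

``` sh
# Hypothetical direct invocation; --healthcheck-delay and
# --healthcheck-timer correspond to the healthcheck_delay and
# healthcheck_timer variables in a service script.
supervise-daemon mydaemon --start \
	--healthcheck-delay 60 \
	--healthcheck-timer 30 \
	/usr/sbin/mydaemon -- --nodaemon
```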

