From 56c47518d25bafc98f272b39f654020f512fd582 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 21 Mar 2007 18:02:03 +0000 Subject: [PATCH] Add special message for user root to restart slurmctld daemons as needed. I keep finding the backup slurmctld down on production clusters and hope this helps. --- src/scontrol/scontrol.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 990be525474..4727a1867e3 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -339,6 +339,7 @@ _ping_slurmctld(char *control_machine, char *backup_controller) { static char *state[2] = { "UP", "DOWN" }; int primary = 1, secondary = 1; + int down_msg = 0; if (slurm_ping(1) == SLURM_SUCCESS) primary = 0; @@ -347,17 +348,27 @@ _ping_slurmctld(char *control_machine, char *backup_controller) fprintf(stdout, "Slurmctld(primary/backup) "); if (control_machine || backup_controller) { fprintf(stdout, "at "); - if (control_machine) + if (control_machine) { fprintf(stdout, "%s/", control_machine); - else + if (primary) + down_msg = 1; + } else fprintf(stdout, "(NULL)/"); - if (backup_controller) + if (backup_controller) { fprintf(stdout, "%s ", backup_controller); - else + if (secondary) + down_msg = 1; + } else fprintf(stdout, "(NULL) "); } fprintf(stdout, "are %s/%s\n", state[primary], state[secondary]); + + if (down_msg && ((getuid() == 0) || (geteuid() == 0))) { + fprintf(stdout, "*****************************************\n"); + fprintf(stdout, "** RESTORE SLURMCTLD DAEMON TO SERVICE **\n"); + fprintf(stdout, "*****************************************\n"); + } } /* -- GitLab