Skip to content
Snippets Groups Projects
Commit 56c47518 authored by Moe Jette
Browse files

Add special message for user root to restart slurmctld daemons as needed.

I keep finding the backup slurmctld down on production clusters and hope
this helps.
parent ae33944b
No related branches found
No related tags found
No related merge requests found
...@@ -339,6 +339,7 @@ _ping_slurmctld(char *control_machine, char *backup_controller) ...@@ -339,6 +339,7 @@ _ping_slurmctld(char *control_machine, char *backup_controller)
{ {
static char *state[2] = { "UP", "DOWN" }; static char *state[2] = { "UP", "DOWN" };
int primary = 1, secondary = 1; int primary = 1, secondary = 1;
int down_msg = 0;
if (slurm_ping(1) == SLURM_SUCCESS) if (slurm_ping(1) == SLURM_SUCCESS)
primary = 0; primary = 0;
...@@ -347,17 +348,27 @@ _ping_slurmctld(char *control_machine, char *backup_controller) ...@@ -347,17 +348,27 @@ _ping_slurmctld(char *control_machine, char *backup_controller)
fprintf(stdout, "Slurmctld(primary/backup) "); fprintf(stdout, "Slurmctld(primary/backup) ");
if (control_machine || backup_controller) { if (control_machine || backup_controller) {
fprintf(stdout, "at "); fprintf(stdout, "at ");
if (control_machine) if (control_machine) {
fprintf(stdout, "%s/", control_machine); fprintf(stdout, "%s/", control_machine);
else if (primary)
down_msg = 1;
} else
fprintf(stdout, "(NULL)/"); fprintf(stdout, "(NULL)/");
if (backup_controller) if (backup_controller) {
fprintf(stdout, "%s ", backup_controller); fprintf(stdout, "%s ", backup_controller);
else if (secondary)
down_msg = 1;
} else
fprintf(stdout, "(NULL) "); fprintf(stdout, "(NULL) ");
} }
fprintf(stdout, "are %s/%s\n", fprintf(stdout, "are %s/%s\n",
state[primary], state[secondary]); state[primary], state[secondary]);
if (down_msg && ((getuid() == 0) || (geteuid() == 0))) {
fprintf(stdout, "*****************************************\n");
fprintf(stdout, "** RESTORE SLURMCTLD DAEMON TO SERVICE **\n");
fprintf(stdout, "*****************************************\n");
}
} }
/* /*
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment