Skip to content
Snippets Groups Projects
Commit ee2ddfeb authored by Morris Jette's avatar Morris Jette
Browse files

Improve fault tolerance for Cray power cycling

This patch adds improved error checking and retry logic for
  changing KNL NUMA and MCDRAM modes and for power cycling
  the nodes.
bug 2658
parent eeacee53
No related branches found
No related tags found
No related merge requests found
...@@ -74,6 +74,18 @@ ...@@ -74,6 +74,18 @@
#define DEFAULT_CAPMC_TIMEOUT 10000 /* 10 seconds */ #define DEFAULT_CAPMC_TIMEOUT 10000 /* 10 seconds */
#define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */
/* Number of times to try performing "node_off" operation */
#define NODE_OFF_RETRIES 10
/* Number of times to try performing "node_on" operation */
#define NODE_ON_RETRIES 10
/* Number of times to try performing node state change operation */
#define NODE_STATE_RETRIES 10
/* How long to wait for a node to enter "off" state, in seconds */
#define NODE_OFF_STATE_WAIT (30 * 60)
/* Static variables */ /* Static variables */
static char *capmc_path = NULL; static char *capmc_path = NULL;
static uint32_t capmc_poll_freq = 45; static uint32_t capmc_poll_freq = 45;
...@@ -215,6 +227,9 @@ static char *_run_script(char **script_argv, int *status) ...@@ -215,6 +227,9 @@ static char *_run_script(char **script_argv, int *status)
close(pfd[1]); close(pfd[1]);
error("%s: fork(): %s", prog_name, error("%s: fork(): %s", prog_name,
slurm_strerror(slurm_get_errno())); slurm_strerror(slurm_get_errno()));
*status = 127;
resp = xstrdup("System error");
return resp;
} else { } else {
struct pollfd fds; struct pollfd fds;
struct timeval tstart; struct timeval tstart;
...@@ -287,6 +302,8 @@ static bool _check_node_state(int nid, char *nid_str, char *state) ...@@ -287,6 +302,8 @@ static bool _check_node_state(int nid, char *nid_str, char *state)
if (status != 0) { if (status != 0) {
error("%s: capmc(%s,%s,%s): %d %s", prog_name, error("%s: capmc(%s,%s,%s): %d %s", prog_name,
argv[1], argv[2], argv[3], status, resp_msg); argv[1], argv[2], argv[3], status, resp_msg);
xfree(resp_msg);
return node_state_ok;
} }
j = json_tokener_parse(resp_msg); j = json_tokener_parse(resp_msg);
if (j == NULL) { if (j == NULL) {
...@@ -316,6 +333,8 @@ static void *_node_update(void *args) ...@@ -316,6 +333,8 @@ static void *_node_update(void *args)
char *argv[10], nid_str[32], *resp_msg; char *argv[10], nid_str[32], *resp_msg;
int i, nid = -1, status = 0; int i, nid = -1, status = 0;
bool node_state_ok; bool node_state_ok;
bool node_off_sent = false, node_on_sent = false, node_state_sent;
time_t poll_start;
for (i = 0; node_name[i]; i++) { for (i = 0; node_name[i]; i++) {
if ((node_name[i] >= '0') && (node_name[i] <= '9')) { if ((node_name[i] >= '0') && (node_name[i] <= '9')) {
...@@ -340,13 +359,24 @@ static void *_node_update(void *args) ...@@ -340,13 +359,24 @@ static void *_node_update(void *args)
argv[4] = "-n"; argv[4] = "-n";
argv[5] = nid_str; argv[5] = nid_str;
argv[6] = NULL; argv[6] = NULL;
resp_msg = _run_script(argv, &status); node_state_sent = false;
if (status != 0) { for (i = 0; ((i < NODE_STATE_RETRIES) && !node_state_sent);
error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, i++) {
argv[1], argv[2], argv[3], argv[4], argv[5], resp_msg = _run_script(argv, &status);
status, resp_msg); if ((status != 0) ||
(resp_msg &&
(strcasestr(resp_msg, "Success") == NULL))) {
error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
prog_name, argv[1], argv[2], argv[3],
argv[4], argv[5], status, resp_msg);
sleep(1);
} else {
debug("%s: set_mcdram_cfg sent to %s",
prog_name, nid_str);
node_state_sent = true;
}
xfree(resp_msg);
} }
xfree(resp_msg);
} }
if (numa_mode) { if (numa_mode) {
...@@ -359,13 +389,24 @@ static void *_node_update(void *args) ...@@ -359,13 +389,24 @@ static void *_node_update(void *args)
argv[4] = "-n"; argv[4] = "-n";
argv[5] = nid_str; argv[5] = nid_str;
argv[6] = NULL; argv[6] = NULL;
resp_msg = _run_script(argv, &status); node_state_sent = false;
if (status != 0) { for (i = 0; ((i < NODE_STATE_RETRIES) && !node_state_sent);
error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, i++) {
argv[1], argv[2], argv[3], argv[4], argv[5], resp_msg = _run_script(argv, &status);
status, resp_msg); if ((status != 0) ||
(resp_msg &&
(strcasestr(resp_msg, "Success") == NULL))) {
error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
prog_name, argv[1], argv[2], argv[3],
argv[4], argv[5], status, resp_msg);
sleep(1);
} else {
debug("%s: set_numa_cfg sent to %s",
prog_name, nid_str);
node_state_sent = true;
}
xfree(resp_msg);
} }
xfree(resp_msg);
} }
/* Test if already in "off" state */ /* Test if already in "off" state */
...@@ -379,20 +420,33 @@ static void *_node_update(void *args) ...@@ -379,20 +420,33 @@ static void *_node_update(void *args)
argv[2] = "-n"; argv[2] = "-n";
argv[3] = nid_str; argv[3] = nid_str;
argv[4] = NULL; argv[4] = NULL;
resp_msg = _run_script(argv, &status); for (i = 0; ((i < NODE_OFF_RETRIES) && !node_off_sent); i++) {
if (status != 0) { resp_msg = _run_script(argv, &status);
error("%s: capmc(%s,%s,%s): %d %s", prog_name, if ((status != 0) ||
argv[1], argv[2], argv[3], status, resp_msg); (resp_msg &&
(strcasestr(resp_msg, "Success") == NULL))) {
error("%s: capmc(%s,%s,%s): %d %s", prog_name,
argv[1], argv[2], argv[3], status,
resp_msg);
sleep(1);
} else {
debug("%s: node_off sent to %s",
prog_name, nid_str);
node_off_sent = true;
}
xfree(resp_msg);
} }
xfree(resp_msg);
} }
/* Wait for node in "off" state */ /* Wait for node in "off" state */
while (!node_state_ok) { poll_start = time(NULL);
while (!node_state_ok &&
(difftime(time(NULL), poll_start) < NODE_OFF_STATE_WAIT)) {
sleep(capmc_poll_freq); sleep(capmc_poll_freq);
node_state_ok = _check_node_state(nid, nid_str, "off"); node_state_ok = _check_node_state(nid, nid_str, "off");
} }
/* Request node power up. /* Request node power up.
 * Example: "capmc node_on -n 43" */ * Example: "capmc node_on -n 43" */
argv[0] = "capmc"; argv[0] = "capmc";
...@@ -400,12 +454,19 @@ static void *_node_update(void *args) ...@@ -400,12 +454,19 @@ static void *_node_update(void *args)
argv[2] = "-n"; argv[2] = "-n";
argv[3] = nid_str; argv[3] = nid_str;
argv[4] = NULL; argv[4] = NULL;
resp_msg = _run_script(argv, &status); for (i = 0; ((i < NODE_ON_RETRIES) && !node_on_sent); i++) {
if (status != 0) { resp_msg = _run_script(argv, &status);
error("%s: capmc(%s,%s,%s): %d %s", prog_name, if ((status != 0) ||
argv[1], argv[2], argv[3], status, resp_msg); (resp_msg && (strcasestr(resp_msg, "Success") == NULL))) {
error("%s: capmc(%s,%s,%s): %d %s", prog_name,
argv[1], argv[2], argv[3], status, resp_msg);
sleep(1);
} else {
debug("%s: node_on sent to %s", prog_name, nid_str);
node_on_sent = true;
}
xfree(resp_msg);
} }
xfree(resp_msg);
fini: slurm_mutex_lock(&thread_cnt_mutex); fini: slurm_mutex_lock(&thread_cnt_mutex);
thread_cnt--; thread_cnt--;
......
...@@ -73,6 +73,12 @@ ...@@ -73,6 +73,12 @@
#define DEFAULT_CAPMC_TIMEOUT 10000 /* 10 seconds */ #define DEFAULT_CAPMC_TIMEOUT 10000 /* 10 seconds */
#define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */
/* Number of times to try performing "node_off" operation */
#define NODE_OFF_RETRIES 10
/* How long to wait for a node to enter "off" state, in seconds */
#define NODE_OFF_STATE_WAIT (30 * 60)
/* Static variables */ /* Static variables */
static char *capmc_path = NULL; static char *capmc_path = NULL;
static uint32_t capmc_poll_freq = 45; /* capmc state polling frequency */ static uint32_t capmc_poll_freq = 45; /* capmc state polling frequency */
...@@ -210,6 +216,9 @@ static char *_run_script(char **script_argv, int *status) ...@@ -210,6 +216,9 @@ static char *_run_script(char **script_argv, int *status)
close(pfd[1]); close(pfd[1]);
error("%s: fork(): %s", prog_name, error("%s: fork(): %s", prog_name,
slurm_strerror(slurm_get_errno())); slurm_strerror(slurm_get_errno()));
*status = 127;
resp = xstrdup("System error");
return resp;
} else { } else {
struct pollfd fds; struct pollfd fds;
struct timeval tstart; struct timeval tstart;
...@@ -315,6 +324,8 @@ static bool _check_node_state(int nid, char *nid_str, char *state) ...@@ -315,6 +324,8 @@ static bool _check_node_state(int nid, char *nid_str, char *state)
if (status != 0) { if (status != 0) {
error("%s: capmc(%s,%s,%s): %d %s", prog_name, error("%s: capmc(%s,%s,%s): %d %s", prog_name,
argv[1], argv[2], argv[3], status, resp_msg); argv[1], argv[2], argv[3], status, resp_msg);
xfree(resp_msg);
return node_state_ok;
} }
j = json_tokener_parse(resp_msg); j = json_tokener_parse(resp_msg);
if (j == NULL) { if (j == NULL) {
...@@ -343,7 +354,8 @@ static void *_node_update(void *args) ...@@ -343,7 +354,8 @@ static void *_node_update(void *args)
char *node_name = (char *) args; char *node_name = (char *) args;
char *argv[10], nid_str[32], *resp_msg; char *argv[10], nid_str[32], *resp_msg;
int i, nid = -1, status = 0; int i, nid = -1, status = 0;
bool node_state_ok; bool node_state_ok, node_off_sent = false;
time_t poll_start;
for (i = 0; node_name[i]; i++) { for (i = 0; node_name[i]; i++) {
if ((node_name[i] >= '0') && (node_name[i] <= '9')) { if ((node_name[i] >= '0') && (node_name[i] <= '9')) {
...@@ -364,15 +376,24 @@ static void *_node_update(void *args) ...@@ -364,15 +376,24 @@ static void *_node_update(void *args)
argv[2] = "-n"; argv[2] = "-n";
argv[3] = nid_str; argv[3] = nid_str;
argv[4] = NULL; argv[4] = NULL;
resp_msg = _run_script(argv, &status); for (i = 0; ((i < NODE_OFF_RETRIES) && !node_off_sent); i++) {
if (status != 0) { resp_msg = _run_script(argv, &status);
error("%s: capmc(%s,%s,%s): %d %s", prog_name, if ((status != 0) ||
argv[1], argv[2], argv[3], status, resp_msg); (resp_msg && (strcasestr(resp_msg, "Success") == NULL))) {
error("%s: capmc(%s,%s,%s): %d %s", prog_name,
argv[1], argv[2], argv[3], status, resp_msg);
sleep(1);
} else {
debug("%s: node_off sent to %s", prog_name, nid_str);
node_off_sent = true;
}
xfree(resp_msg);
} }
xfree(resp_msg);
/* Wait for node in "off" state */ /* Wait for node in "off" state */
while (!node_state_ok) { poll_start = time(NULL);
while (!node_state_ok &&
(difftime(time(NULL), poll_start) < NODE_OFF_STATE_WAIT)) {
sleep(capmc_poll_freq); sleep(capmc_poll_freq);
node_state_ok = _check_node_state(nid, nid_str, "off"); node_state_ok = _check_node_state(nid, nid_str, "off");
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment