diff --git a/contribs/cray/capmc_resume.c b/contribs/cray/capmc_resume.c index bbc8e79b2e6bb7f2e411a3153fdf6e338d924a49..e2c7fe432bb888e557158e0176047ef01a00306a 100644 --- a/contribs/cray/capmc_resume.c +++ b/contribs/cray/capmc_resume.c @@ -74,6 +74,18 @@ #define DEFAULT_CAPMC_TIMEOUT 10000 /* 10 seconds */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ +/* Number of times to try performing "node_off" operation */ +#define NODE_OFF_RETRIES 10 + +/* Number of times to try performing "node_on" operation */ +#define NODE_ON_RETRIES 10 + +/* Number of times to try performing node state change operation */ +#define NODE_STATE_RETRIES 10 + +/* How long to wait for a node to enter "off" state, in seconds */ +#define NODE_OFF_STATE_WAIT (30 * 60) + /* Static variables */ static char *capmc_path = NULL; static uint32_t capmc_poll_freq = 45; @@ -215,6 +227,9 @@ static char *_run_script(char **script_argv, int *status) close(pfd[1]); error("%s: fork(): %s", prog_name, slurm_strerror(slurm_get_errno())); + *status = 127; + resp = xstrdup("System error"); + return resp; } else { struct pollfd fds; struct timeval tstart; @@ -287,6 +302,8 @@ static bool _check_node_state(int nid, char *nid_str, char *state) if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); + xfree(resp_msg); + return node_state_ok; } j = json_tokener_parse(resp_msg); if (j == NULL) { @@ -316,6 +333,8 @@ static void *_node_update(void *args) char *argv[10], nid_str[32], *resp_msg; int i, nid = -1, status = 0; bool node_state_ok; + bool node_off_sent = false, node_on_sent = false, node_state_sent; + time_t poll_start; for (i = 0; node_name[i]; i++) { if ((node_name[i] >= '0') && (node_name[i] <= '9')) { @@ -340,13 +359,24 @@ static void *_node_update(void *args) argv[4] = "-n"; argv[5] = nid_str; argv[6] = NULL; - resp_msg = _run_script(argv, &status); - if (status != 0) { - error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, - argv[1], argv[2], argv[3], argv[4], argv[5], - status, resp_msg); + node_state_sent = false; + for (i = 0; ((i < NODE_STATE_RETRIES) && !node_state_sent); + i++) { + resp_msg = _run_script(argv, &status); + if ((status != 0) || + (resp_msg && + (strcasestr(resp_msg, "Success") == NULL))) { + error("%s: capmc(%s,%s,%s,%s,%s): %d %s", + prog_name, argv[1], argv[2], argv[3], + argv[4], argv[5], status, resp_msg); + sleep(1); + } else { + debug("%s: set_mcdram_cfg sent to %s", + prog_name, nid_str); + node_state_sent = true; + } + xfree(resp_msg); } - xfree(resp_msg); } if (numa_mode) { @@ -359,13 +389,24 @@ static void *_node_update(void *args) argv[4] = "-n"; argv[5] = nid_str; argv[6] = NULL; - resp_msg = _run_script(argv, &status); - if (status != 0) { - error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, - argv[1], argv[2], argv[3], argv[4], argv[5], - status, resp_msg); + node_state_sent = false; + for (i = 0; ((i < NODE_STATE_RETRIES) && !node_state_sent); + i++) { + resp_msg = _run_script(argv, &status); + if ((status != 0) || + (resp_msg && + (strcasestr(resp_msg, "Success") == NULL))) { + error("%s: capmc(%s,%s,%s,%s,%s): %d %s", + prog_name, argv[1], argv[2], argv[3], + argv[4], argv[5], status, resp_msg); + sleep(1); + } else { + debug("%s: set_numa_cfg sent to %s", + prog_name, nid_str); + node_state_sent = true; + } + xfree(resp_msg); } - xfree(resp_msg); } /* Test if already in "off" state */ @@ -379,20 +420,33 @@ static void *_node_update(void *args) argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; - resp_msg = _run_script(argv, &status); - if (status != 0) { - error("%s: capmc(%s,%s,%s): %d %s", prog_name, - argv[1], argv[2], argv[3], status, resp_msg); + for (i = 0; ((i < NODE_OFF_RETRIES) && !node_off_sent); i++) { + resp_msg = _run_script(argv, &status); + if ((status != 0) || + (resp_msg && + (strcasestr(resp_msg, "Success") == NULL))) { + error("%s: capmc(%s,%s,%s): %d %s", prog_name, + argv[1], argv[2], argv[3], status, + resp_msg); + sleep(1); + } else { + debug("%s: node_off sent to %s", + prog_name, nid_str); + node_off_sent = true; + } + xfree(resp_msg); } - xfree(resp_msg); } /* Wait for node in "off" state */ - while (!node_state_ok) { + poll_start = time(NULL); + while (!node_state_ok && + (difftime(time(NULL), poll_start) < NODE_OFF_STATE_WAIT)) { sleep(capmc_poll_freq); node_state_ok = _check_node_state(nid, nid_str, "off"); } + /* Request node power up. * Example: "capmc node_on –n 43" */ argv[0] = "capmc"; @@ -400,12 +454,19 @@ static void *_node_update(void *args) argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; - resp_msg = _run_script(argv, &status); - if (status != 0) { - error("%s: capmc(%s,%s,%s): %d %s", prog_name, - argv[1], argv[2], argv[3], status, resp_msg); + for (i = 0; ((i < NODE_ON_RETRIES) && !node_on_sent); i++) { + resp_msg = _run_script(argv, &status); + if ((status != 0) || + (resp_msg && (strcasestr(resp_msg, "Success") == NULL))) { + error("%s: capmc(%s,%s,%s): %d %s", prog_name, + argv[1], argv[2], argv[3], status, resp_msg); + sleep(1); + } else { + debug("%s: node_on sent to %s", prog_name, nid_str); + node_on_sent = true; + } + xfree(resp_msg); } - xfree(resp_msg); fini: slurm_mutex_lock(&thread_cnt_mutex); thread_cnt--; diff --git a/contribs/cray/capmc_suspend.c b/contribs/cray/capmc_suspend.c index 772a5d5aa45fb17f9e5e16139650edd2aad4dafb..fbeebc0ccf81a734f75c9556c6f1cfdb3c4be4cc 100644 --- a/contribs/cray/capmc_suspend.c +++ b/contribs/cray/capmc_suspend.c @@ -73,6 +73,12 @@ #define DEFAULT_CAPMC_TIMEOUT 10000 /* 10 seconds */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ +/* Number of times to try performing "node_off" operation */ +#define NODE_OFF_RETRIES 10 + +/* How long to wait for a node to enter "off" state, in seconds */ +#define NODE_OFF_STATE_WAIT (30 * 60) + /* Static variables */ static char *capmc_path = NULL; static uint32_t capmc_poll_freq = 45; /* capmc state polling frequency */ @@ -210,6 +216,9 @@ static char *_run_script(char **script_argv, int *status) close(pfd[1]); error("%s: fork(): %s", prog_name, slurm_strerror(slurm_get_errno())); + *status = 127; + resp = xstrdup("System error"); + return resp; } else { struct pollfd fds; struct timeval tstart; @@ -315,6 +324,8 @@ static bool _check_node_state(int nid, char *nid_str, char *state) if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); + xfree(resp_msg); + return node_state_ok; } j = json_tokener_parse(resp_msg); if (j == NULL) { @@ -343,7 +354,8 @@ static void *_node_update(void *args) char *node_name = (char *) args; char *argv[10], nid_str[32], *resp_msg; int i, nid = -1, status = 0; - bool node_state_ok; + bool node_state_ok, node_off_sent = false; + time_t poll_start; for (i = 0; node_name[i]; i++) { if ((node_name[i] >= '0') && (node_name[i] <= '9')) { @@ -364,15 +376,24 @@ static void *_node_update(void *args) argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; - resp_msg = _run_script(argv, &status); - if (status != 0) { - error("%s: capmc(%s,%s,%s): %d %s", prog_name, - argv[1], argv[2], argv[3], status, resp_msg); + for (i = 0; ((i < NODE_OFF_RETRIES) && !node_off_sent); i++) { + resp_msg = _run_script(argv, &status); + if ((status != 0) || + (resp_msg && (strcasestr(resp_msg, "Success") == NULL))) { + error("%s: capmc(%s,%s,%s): %d %s", prog_name, + argv[1], argv[2], argv[3], status, resp_msg); + sleep(1); + } else { + debug("%s: node_off sent to %s", prog_name, nid_str); + node_off_sent = true; + } + xfree(resp_msg); } - xfree(resp_msg); /* Wait for node in "off" state */ - while (!node_state_ok) { + poll_start = time(NULL); + while (!node_state_ok && + (difftime(time(NULL), poll_start) < NODE_OFF_STATE_WAIT)) { sleep(capmc_poll_freq); node_state_ok = _check_node_state(nid, nid_str, "off"); }