From 5cb90497ca3af1bfb2969d4bff1b82264f10da6c Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Tue, 4 Oct 2016 09:58:07 -0600 Subject: [PATCH] add knl.conf parameter CapmcRetries Add new knl.conf configuration parameter CapmcRetries. Modify capmc_suspend and capmc_resume to retry operations when Cray State Manager is down. Add retry logic to node_features/knl_cray to handle Cray State Manager being down. bug 3100 --- NEWS | 3 + contribs/cray/capmc_resume.c | 98 +++++++--- contribs/cray/capmc_suspend.c | 52 ++--- doc/man/man5/knl.conf.5 | 5 + slurm/slurm.h.in | 13 +- .../knl_cray/node_features_knl_cray.c | 183 ++++++++++++------ 6 files changed, 241 insertions(+), 113 deletions(-) diff --git a/NEWS b/NEWS index c1cc109f022..3c0172c1c51 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,9 @@ documents those changes that are of interest to users and administrators. -- SlurmDB - include pending jobs in search during 'sacctmgr show runawayjobs'. -- Add client side out-of-range checks to --nice flag. -- Fix support for sbatch "-W" option, previously eeded to use "--wait". + -- node_features/knl_cray plugin and capmc_suspend/resume programs modified to + sleep and retry capmc operations if the Cray State Manager is down. Added + CapmcRetries configuration parameter to knl_cray.conf. 
* Changes in Slurm 16.05.5 ========================== diff --git a/contribs/cray/capmc_resume.c b/contribs/cray/capmc_resume.c index 1f2ca4bc7e5..b743192a830 100644 --- a/contribs/cray/capmc_resume.c +++ b/contribs/cray/capmc_resume.c @@ -71,6 +71,7 @@ #define MAX_POLL_WAIT 500 /* Default and minimum timeout parameters for the capmc command */ +#define DEFAULT_CAPMC_RETRIES 4 #define DEFAULT_CAPMC_TIMEOUT 60000 /* 60 seconds */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ @@ -83,6 +84,7 @@ /* Static variables */ static char *capmc_path = NULL; static uint32_t capmc_poll_freq = 45; +static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES; static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT; static char *log_file = NULL; static bitstr_t *node_bitmap = NULL; @@ -98,6 +100,7 @@ static s_p_options_t knl_conf_file_options[] = { {"AllowUserBoot", S_P_STRING}, {"CapmcPath", S_P_STRING}, {"CapmcPollFreq", S_P_UINT32}, + {"CapmcRetries", S_P_UINT32}, {"CapmcTimeout", S_P_UINT32}, {"CnselectPath", S_P_STRING}, {"DefaultMCDRAM", S_P_STRING}, @@ -146,6 +149,7 @@ static void _read_config(void) if ((tbl = _config_make_tbl(knl_conf_file))) { (void) s_p_get_string(&capmc_path, "CapmcPath", tbl); (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl); + (void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl); (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl); (void) s_p_get_string(&log_file, "LogFile", tbl); (void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl); @@ -319,7 +323,7 @@ static char *_node_names_2_nid_list(char *node_names) static int _update_all_nodes(char *host_list) { char *argv[10], *nid_list, *resp_msg; - int rc = 0, status = 0; + int rc = 0, retry, status = 0; nid_list = _node_names_2_nid_list(host_list); @@ -333,18 +337,30 @@ static int _update_all_nodes(char *host_list) argv[4] = "-n"; argv[5] = nid_list; argv[6] = NULL; - resp_msg = _run_script(argv, &status); - if ((status == 0) || - (resp_msg && strcasestr(resp_msg, "Success"))) { - 
debug("%s: set_mcdram_cfg sent to %s", - prog_name, argv[5]); - } else { + for (retry = 0; ; retry++) { + resp_msg = _run_script(argv, &status); + if ((status == 0) || + (resp_msg && strcasestr(resp_msg, "Success"))) { + debug("%s: set_mcdram_cfg sent to %s", + prog_name, argv[5]); + xfree(resp_msg); + break; + } error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], argv[4], argv[5], status, resp_msg); - rc = -1; + if (resp_msg && strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + /* Non-recoverable error */ + rc = -1; + xfree(resp_msg); + break; + } } - xfree(resp_msg); } if (numa_mode && (rc == 0)) { @@ -357,18 +373,30 @@ static int _update_all_nodes(char *host_list) argv[4] = "-n"; argv[5] = nid_list; argv[6] = NULL; - resp_msg = _run_script(argv, &status); - if ((status == 0) || - (resp_msg && strcasestr(resp_msg, "Success"))) { - debug("%s: set_numa_cfg sent to %s", - prog_name, argv[5]); - } else { + for (retry = 0; ; retry++) { + resp_msg = _run_script(argv, &status); + if ((status == 0) || + (resp_msg && strcasestr(resp_msg, "Success"))) { + debug("%s: set_numa_cfg sent to %s", + prog_name, argv[5]); + xfree(resp_msg); + break; + } error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], argv[4], argv[5], status, resp_msg); - rc = -1; + if (resp_msg && strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + /* Non-recoverable error */ + rc = -1; + xfree(resp_msg); + break; + } } - xfree(resp_msg); } /* Request node restart. 
@@ -381,17 +409,29 @@ static int _update_all_nodes(char *host_list) argv[4] = NULL; // argv[4] = "-r"; /* Future option: Reason */ // argv[5] = "Change KNL mode"; - resp_msg = _run_script(argv, &status); - if ((status == 0) || - (resp_msg && strcasestr(resp_msg, "Success"))) { - debug("%s: node_reinit sent to %s", - prog_name, argv[3]); - } else { + for (retry = 0; ; retry++) { + resp_msg = _run_script(argv, &status); + if ((status == 0) || + (resp_msg && strcasestr(resp_msg, "Success"))) { + debug("%s: node_reinit sent to %s", + prog_name, argv[3]); + xfree(resp_msg); + break; + } error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); - rc = -1; + if (resp_msg && strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + /* Non-recoverable error */ + rc = -1; + xfree(resp_msg); + break; + } } - xfree(resp_msg); } xfree(nid_list); @@ -483,7 +523,9 @@ int main(int argc, char *argv[]) log_opts.stderr_level = LOG_LEVEL_QUIET; log_opts.syslog_level = LOG_LEVEL_QUIET; if (slurm_get_debug_flags() && DEBUG_FLAG_NODE_FEATURES) - log_opts.logfile_level += 3; + log_opts.logfile_level = LOG_LEVEL_DEBUG; + else + log_opts.logfile_level = LOG_LEVEL_ERROR; (void) log_init(argv[0], log_opts, LOG_DAEMON, log_file); if ((argc < 2) || (argc > 3)) { @@ -521,13 +563,13 @@ int main(int argc, char *argv[]) node_bitmap = bit_alloc(100000); if (_update_all_nodes(argv[1]) != 0) { /* Could not reboot nodes. 
- * Requeue/hold the job we were trying to start */ + * Requeue the job we were trying to start */ uint32_t job_id = 0; char *job_id_str = getenv("SLURM_JOB_ID"); if (job_id_str) job_id = strtol(job_id_str, NULL, 10); if (job_id) - (void) slurm_requeue(job_id, JOB_REQUEUE_HOLD); + (void) slurm_requeue(job_id, JOB_RECONFIG_FAIL); /* Return the nodes to service */ slurm_init_update_node_msg(&node_msg); diff --git a/contribs/cray/capmc_suspend.c b/contribs/cray/capmc_suspend.c index 8ee99e680db..1091275d8a8 100644 --- a/contribs/cray/capmc_suspend.c +++ b/contribs/cray/capmc_suspend.c @@ -71,6 +71,7 @@ #define MAX_POLL_WAIT 500 /* Default and minimum timeout parameters for the capmc command */ +#define DEFAULT_CAPMC_RETRIES 4 #define DEFAULT_CAPMC_TIMEOUT 60000 /* 60 seconds */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ @@ -83,14 +84,11 @@ /* Static variables */ static char *capmc_path = NULL; static uint32_t capmc_poll_freq = 45; /* capmc state polling frequency */ +static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES; static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT; static char *log_file = NULL; static char *prog_name = NULL; -static pthread_mutex_t thread_cnt_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t thread_cnt_cond = PTHREAD_COND_INITIALIZER; -static int thread_cnt = 0; - /* NOTE: Keep this table synchronized with the table in * src/plugins/node_features/knl_cray/node_features_knl_cray.c */ static s_p_options_t knl_conf_file_options[] = { @@ -99,6 +97,7 @@ static s_p_options_t knl_conf_file_options[] = { {"AllowUserBoot", S_P_STRING}, {"CapmcPath", S_P_STRING}, {"CapmcPollFreq", S_P_UINT32}, + {"CapmcRetries", S_P_UINT32}, {"CapmcTimeout", S_P_UINT32}, {"CnselectPath", S_P_STRING}, {"DefaultMCDRAM", S_P_STRING}, @@ -146,6 +145,7 @@ static void _read_config(void) if ((tbl = _config_make_tbl(knl_conf_file))) { (void) s_p_get_string(&capmc_path, "CapmcPath", tbl); (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl); + (void) 
s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl); (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl); (void) s_p_get_string(&log_file, "LogFile", tbl); } @@ -321,7 +321,7 @@ static char *_node_names_2_nid_list(char *node_names) static int _update_all_nodes(char *node_names) { char *argv[10], *nid_list, *resp_msg; - int rc = -1, status = 0; + int rc = 0, retry, status = 0; nid_list = _node_names_2_nid_list(node_names); if (nid_list == NULL) @@ -334,15 +334,31 @@ static int _update_all_nodes(char *node_names) argv[2] = "-n"; argv[3] = nid_list; argv[4] = NULL; - resp_msg = _run_script(argv, &status); - if ((status == 0) || - (resp_msg && strcasestr(resp_msg, "Success"))) { - debug("%s: node_off sent to %s", prog_name, argv[3]); - rc = 0; - } else { + for (retry = 0; ; retry++) { + resp_msg = _run_script(argv, &status); + if ((status == 0) || + (resp_msg && strcasestr(resp_msg, "Success"))) { + debug("%s: node_off sent to %s", prog_name, argv[3]); + xfree(resp_msg); + break; + } error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); + if (resp_msg && strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. Sleep and retry */ + error("Cray State Manager is down, retrying request"); + sleep(1); + xfree(resp_msg); + } else { + /* Non-recoverable error */ + error("Aborting capmc_suspend for %s", nid_list); + rc = -1; + xfree(resp_msg); + break; + } } + xfree(resp_msg); xfree(nid_list); return rc; @@ -357,23 +373,15 @@ int main(int argc, char *argv[]) log_opts.stderr_level = LOG_LEVEL_QUIET; log_opts.syslog_level = LOG_LEVEL_QUIET; if (slurm_get_debug_flags() && DEBUG_FLAG_NODE_FEATURES) - log_opts.logfile_level += 3; + log_opts.logfile_level = LOG_LEVEL_DEBUG; + else + log_opts.logfile_level = LOG_LEVEL_ERROR; (void) log_init(argv[0], log_opts, LOG_DAEMON, log_file); /* Attempt to shutdown all nodes in a single capmc call. 
*/ if (_update_all_nodes(argv[1]) != 0) exit(1); - /* Wait for work threads to complete */ - slurm_mutex_lock(&thread_cnt_mutex); - while (1) { - if (thread_cnt == 0) - break; - else /* wait for state change and retry */ - pthread_cond_wait(&thread_cnt_cond, &thread_cnt_mutex); - } - slurm_mutex_unlock(&thread_cnt_mutex); - xfree(prog_name); exit(0); } diff --git a/doc/man/man5/knl.conf.5 b/doc/man/man5/knl.conf.5 index bc32ca9853b..5489bb8e564 100644 --- a/doc/man/man5/knl.conf.5 +++ b/doc/man/man5/knl.conf.5 @@ -92,6 +92,11 @@ changes, in seconds. The default value is 45 seconds. This parameter is used only by the "knl_cray" plugin. +.TP +\fBCapmcRetries\fR +Number of times to retry failed operations of the \fBcapmc\fR program. +Default value is 4. + .TP \fBCapmcTimeout\fR Time limit for the \fBcapmc\fR program to return status information milliseconds. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 548d205e8f9..8126c9fe77b 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -240,6 +240,9 @@ enum job_states { #define JOB_COMPLETING 0x00008000 /* Waiting for epilog completion */ #define JOB_STOPPED 0x00010000 /* Job is stopped state (holding resources, but sent SIGSTOP */ +#define JOB_RECONFIG_FAIL 0x00020000 /* Node configuration for job failed, + not job state, just job requeue flag */ + #define READY_JOB_FATAL -2 /* fatal error */ #define READY_JOB_ERROR -1 /* ordinary error */ @@ -4461,9 +4464,10 @@ extern void slurm_free_job_array_resp(job_array_resp_msg_t *resp); * valid values are: * 0 - if the job has to be requeued in JOB_PENDING state * JOB_SPECIAL_EXIT - if the job has to be requeued in - * the special exit state and be held. + * the special exit state and be held. * JOB_REQUEUE_HOLD - if the job has to be requeued in - * JOB_PENDING and held state. + * JOB_PENDING and held state. 
+ * JOB_RECONFIG_FAIL - Node configuration for job failed * RET 0 or a slurm error code */ extern int slurm_requeue(uint32_t job_id, uint32_t state); @@ -4477,9 +4481,10 @@ extern int slurm_requeue(uint32_t job_id, uint32_t state); * valid values are: * 0 - if the job has to be requeued in JOB_PENDING state * JOB_SPECIAL_EXIT - if the job has to be requeued in - * the special exit state and be held. + * the special exit state and be held. * JOB_REQUEUE_HOLD - if the job has to be requeued in - * JOB_PENDING and held state. + * JOB_PENDING and held state. + * JOB_RECONFIG_FAIL - Node configuration for job failed * OUT resp - per task response to the request, * free using slurm_free_job_array_resp() * RET 0 or a slurm error code diff --git a/src/plugins/node_features/knl_cray/node_features_knl_cray.c b/src/plugins/node_features/knl_cray/node_features_knl_cray.c index 956ba8b581a..289758f9439 100644 --- a/src/plugins/node_features/knl_cray/node_features_knl_cray.c +++ b/src/plugins/node_features/knl_cray/node_features_knl_cray.c @@ -84,6 +84,7 @@ #define MAX_POLL_WAIT 500 /* Default and minimum timeout parameters for the capmc command */ +#define DEFAULT_CAPMC_RETRIES 4 #define DEFAULT_CAPMC_TIMEOUT 60000 /* 60 seconds */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ @@ -154,6 +155,7 @@ List active_feature_list; /* Configuration Paramters */ static char *capmc_path = NULL; static uint32_t capmc_poll_freq = 45; /* capmc state polling frequency */ +static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES; static uint32_t capmc_timeout = 0; /* capmc command timeout in msec */ static char *cnselect_path = NULL; static bool debug_flag = false; @@ -180,6 +182,7 @@ static s_p_options_t knl_conf_file_options[] = { {"AllowUserBoot", S_P_STRING}, {"CapmcPath", S_P_STRING}, {"CapmcPollFreq", S_P_UINT32}, + {"CapmcRetries", S_P_UINT32}, {"CapmcTimeout", S_P_UINT32}, {"CnselectPath", S_P_STRING}, {"DefaultMCDRAM", S_P_STRING}, @@ -1522,6 +1525,7 @@ extern int init(void) } (void) 
s_p_get_string(&capmc_path, "CapmcPath", tbl); (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl); + (void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl); (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl); (void) s_p_get_string(&cnselect_path, "CnselectPath", tbl); if (s_p_get_string(&tmp_str, "DefaultMCDRAM", tbl)) { @@ -1568,6 +1572,7 @@ extern int init(void) info("AllowUserBoot=%s", allow_user_str); info("CapmcPath=%s", capmc_path); info("CapmcPollFreq=%u sec", capmc_poll_freq); + info("CapmcRetries=%u", capmc_retries); info("CapmcTimeout=%u msec", capmc_timeout); info("CnselectPath=%s", cnselect_path); info("DefaultMCDRAM=%s DefaultNUMA=%s", @@ -1613,7 +1618,7 @@ extern int node_features_p_get_node(char *node_list) { json_object *j; json_object_iter iter; - int i, k, status = 0, rc = SLURM_SUCCESS; + int i, k, rc = SLURM_SUCCESS, retry, status = 0; DEF_TIMERS; char *resp_msg, **script_argv; mcdram_cap_t *mcdram_cap = NULL; @@ -1643,25 +1648,39 @@ extern int node_features_p_get_node(char *node_list) script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */ script_argv[0] = xstrdup("capmc"); script_argv[1] = xstrdup("get_mcdram_capabilities"); - START_TIMER; - resp_msg = _run_script(capmc_path, script_argv, &status); - END_TIMER; - if (debug_flag) { - info("%s: get_mcdram_capabilities ran for %s", - __func__, TIME_STR); - } - _log_script_argv(script_argv, resp_msg); - _free_script_argv(script_argv); - if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) { + for (retry = 0; ; retry++) { + START_TIMER; + resp_msg = _run_script(capmc_path, script_argv, &status); + END_TIMER; + if (debug_flag) { + info("%s: get_mcdram_capabilities ran for %s", + __func__, TIME_STR); + } + _log_script_argv(script_argv, resp_msg); + if (WIFEXITED(status) && (WEXITSTATUS(status) == 0)) + break; /* Success */ error("%s: get_mcdram_capabilities status:%u response:%s", __func__, status, resp_msg); + if (resp_msg == NULL) { + info("%s: 
get_mcdram_capabilities returned no information", + __func__); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } + if (strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + xfree(resp_msg); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } } - if (resp_msg == NULL) { - info("%s: get_mcdram_capabilities returned no information", - __func__); - rc = SLURM_ERROR; - goto fini; - } + _free_script_argv(script_argv); j = json_tokener_parse(resp_msg); if (j == NULL) { @@ -1686,22 +1705,39 @@ extern int node_features_p_get_node(char *node_list) script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */ script_argv[0] = xstrdup("capmc"); script_argv[1] = xstrdup("get_mcdram_cfg"); - START_TIMER; - resp_msg = _run_script(capmc_path, script_argv, &status); - END_TIMER; - if (debug_flag) - info("%s: get_mcdram_cfg ran for %s", __func__, TIME_STR); - _log_script_argv(script_argv, resp_msg); - _free_script_argv(script_argv); - if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) { + for (retry = 0; ; retry++) { + START_TIMER; + resp_msg = _run_script(capmc_path, script_argv, &status); + END_TIMER; + if (debug_flag) { + info("%s: get_mcdram_cfg ran for %s", + __func__, TIME_STR); + } + _log_script_argv(script_argv, resp_msg); + if (WIFEXITED(status) && (WEXITSTATUS(status) == 0)) + break; /* Success */ error("%s: get_mcdram_cfg status:%u response:%s", __func__, status, resp_msg); + if (resp_msg == NULL) { + info("%s: get_mcdram_cfg returned no information", + __func__); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } + if (strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. 
Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + xfree(resp_msg); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } } - if (resp_msg == NULL) { - info("%s: get_mcdram_cfg returned no information", __func__); - rc = SLURM_ERROR; - goto fini; - } + _free_script_argv(script_argv); j = json_tokener_parse(resp_msg); if (j == NULL) { @@ -1728,25 +1764,39 @@ extern int node_features_p_get_node(char *node_list) script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */ script_argv[0] = xstrdup("capmc"); script_argv[1] = xstrdup("get_numa_capabilities"); - START_TIMER; - resp_msg = _run_script(capmc_path, script_argv, &status); - END_TIMER; - if (debug_flag) { - info("%s: get_numa_capabilities ran for %s", - __func__, TIME_STR); - } - _log_script_argv(script_argv, resp_msg); - _free_script_argv(script_argv); - if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) { + for (retry = 0; ; retry++) { + START_TIMER; + resp_msg = _run_script(capmc_path, script_argv, &status); + END_TIMER; + if (debug_flag) { + info("%s: get_numa_capabilities ran for %s", + __func__, TIME_STR); + } + _log_script_argv(script_argv, resp_msg); + if (WIFEXITED(status) && (WEXITSTATUS(status) == 0)) + break; /* Success */ error("%s: get_numa_capabilities status:%u response:%s", __func__, status, resp_msg); + if (resp_msg == NULL) { + info("%s: get_numa_capabilities returned no information", + __func__); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } + if (strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. 
Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + xfree(resp_msg); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } } - if (resp_msg == NULL) { - info("%s: get_numa_capabilities returned no information", - __func__); - rc = SLURM_ERROR; - goto fini; - } + _free_script_argv(script_argv); j = json_tokener_parse(resp_msg); if (j == NULL) { @@ -1771,22 +1821,37 @@ extern int node_features_p_get_node(char *node_list) script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */ script_argv[0] = xstrdup("capmc"); script_argv[1] = xstrdup("get_numa_cfg"); - START_TIMER; - resp_msg = _run_script(capmc_path, script_argv, &status); - END_TIMER; - if (debug_flag) - info("%s: get_numa_cfg ran for %s", __func__, TIME_STR); - _log_script_argv(script_argv, resp_msg); - _free_script_argv(script_argv); - if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) { + for (retry = 0; ; retry++) { + START_TIMER; + resp_msg = _run_script(capmc_path, script_argv, &status); + END_TIMER; + if (debug_flag) + info("%s: get_numa_cfg ran for %s", __func__, TIME_STR); + _log_script_argv(script_argv, resp_msg); + if (WIFEXITED(status) && (WEXITSTATUS(status) == 0)) + break; /* Success */ error("%s: get_numa_cfg status:%u response:%s", __func__, status, resp_msg); + if (resp_msg == NULL) { + info("%s: get_numa_cfg returned no information", + __func__); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } + if (strstr(resp_msg, "Could not lookup") && + (retry <= capmc_retries)) { + /* State Manager is down. Sleep and retry */ + sleep(1); + xfree(resp_msg); + } else { + xfree(resp_msg); + _free_script_argv(script_argv); + rc = SLURM_ERROR; + goto fini; + } } - if (resp_msg == NULL) { - info("%s: get_numa_cfg returned no information", __func__); - rc = SLURM_ERROR; - goto fini; - } + _free_script_argv(script_argv); j = json_tokener_parse(resp_msg); if (j == NULL) { -- GitLab