From 5cb90497ca3af1bfb2969d4bff1b82264f10da6c Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 4 Oct 2016 09:58:07 -0600
Subject: [PATCH] add knl.conf parameter CapmcRetries

Add new knl.conf configuration parameter CapmcRetries
Modify capmc_suspend and capmc_resume to retry operations when
  Cray State Manager is down.
Add retry logic to node_features/knl_cray to handle Cray State
  Manager being down.
bug 3100
---
 NEWS                                          |   3 +
 contribs/cray/capmc_resume.c                  |  98 +++++++---
 contribs/cray/capmc_suspend.c                 |  52 ++---
 doc/man/man5/knl.conf.5                       |   5 +
 slurm/slurm.h.in                              |  13 +-
 .../knl_cray/node_features_knl_cray.c         | 183 ++++++++++++------
 6 files changed, 241 insertions(+), 113 deletions(-)

diff --git a/NEWS b/NEWS
index c1cc109f022..3c0172c1c51 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,9 @@ documents those changes that are of interest to users and administrators.
  -- SlurmDB - include pending jobs in search during 'sacctmgr show runawayjobs'.
  -- Add client side out-of-range checks to --nice flag.
  -- Fix support for sbatch "-W" option, previously eeded to use "--wait".
+ -- node_features/knl_cray plugin and capmc_suspend/resume programs modified to
+    sleep and retry capmc operations if the Cray State Manager is down. Added
+    CapmcRetries configuration parameter to knl_cray.conf.
 
 * Changes in Slurm 16.05.5
 ==========================
diff --git a/contribs/cray/capmc_resume.c b/contribs/cray/capmc_resume.c
index 1f2ca4bc7e5..b743192a830 100644
--- a/contribs/cray/capmc_resume.c
+++ b/contribs/cray/capmc_resume.c
@@ -71,6 +71,7 @@
 #define MAX_POLL_WAIT 500
 
 /* Default and minimum timeout parameters for the capmc command */
+#define DEFAULT_CAPMC_RETRIES 4
 #define DEFAULT_CAPMC_TIMEOUT 60000	/* 60 seconds */
 #define MIN_CAPMC_TIMEOUT 1000		/* 1 second */
 
@@ -83,6 +84,7 @@
 /* Static variables */
 static char *capmc_path = NULL;
 static uint32_t capmc_poll_freq = 45;
+static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
 static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
 static char *log_file = NULL;
 static bitstr_t *node_bitmap = NULL;
@@ -98,6 +100,7 @@ static s_p_options_t knl_conf_file_options[] = {
 	{"AllowUserBoot", S_P_STRING},
 	{"CapmcPath", S_P_STRING},
 	{"CapmcPollFreq", S_P_UINT32},
+	{"CapmcRetries", S_P_UINT32},
 	{"CapmcTimeout", S_P_UINT32},
 	{"CnselectPath", S_P_STRING},
 	{"DefaultMCDRAM", S_P_STRING},
@@ -146,6 +149,7 @@ static void _read_config(void)
 	if ((tbl = _config_make_tbl(knl_conf_file))) {
 		(void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
 		(void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
+		(void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
 		(void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
 		(void) s_p_get_string(&log_file, "LogFile", tbl);
 		(void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
@@ -319,7 +323,7 @@ static char *_node_names_2_nid_list(char *node_names)
 static int _update_all_nodes(char *host_list)
 {
 	char *argv[10], *nid_list, *resp_msg;
-	int rc = 0, status = 0;
+	int rc = 0, retry, status = 0;
 
 	nid_list = _node_names_2_nid_list(host_list);
 
@@ -333,18 +337,30 @@ static int _update_all_nodes(char *host_list)
 		argv[4] = "-n";
 		argv[5] = nid_list;
 		argv[6] = NULL;
-		resp_msg = _run_script(argv, &status);
-		if ((status == 0) ||
-		    (resp_msg && strcasestr(resp_msg, "Success"))) {
-			debug("%s: set_mcdram_cfg sent to %s",
-			      prog_name, argv[5]);
-		} else {
+		for (retry = 0; ; retry++) {
+			resp_msg = _run_script(argv, &status);
+			if ((status == 0) ||
+			    (resp_msg && strcasestr(resp_msg, "Success"))) {
+				debug("%s: set_mcdram_cfg sent to %s",
+				      prog_name, argv[5]);
+				xfree(resp_msg);
+				break;
+			}
 			error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
 			      prog_name, argv[1], argv[2], argv[3],
 			      argv[4], argv[5], status, resp_msg);
-			rc = -1;
+			if (resp_msg && strstr(resp_msg, "Could not lookup") &&
+			    (retry < capmc_retries)) {
+				/* State Manager is down. Sleep and retry */
+				sleep(1);
+				xfree(resp_msg);
+			} else {
+				/* Fatal error or retries exhausted */
+				rc = -1;
+				xfree(resp_msg);
+				break;
+			}
+		}
-		xfree(resp_msg);
 	}
 
 	if (numa_mode && (rc == 0)) {
@@ -357,18 +373,30 @@ static int _update_all_nodes(char *host_list)
 		argv[4] = "-n";
 		argv[5] = nid_list;
 		argv[6] = NULL;
-		resp_msg = _run_script(argv, &status);
-		if ((status == 0) ||
-		    (resp_msg && strcasestr(resp_msg, "Success"))) {
-			debug("%s: set_numa_cfg sent to %s",
-			      prog_name, argv[5]);
-		} else {
+		for (retry = 0; ; retry++) {
+			resp_msg = _run_script(argv, &status);
+			if ((status == 0) ||
+			    (resp_msg && strcasestr(resp_msg, "Success"))) {
+				debug("%s: set_numa_cfg sent to %s",
+				      prog_name, argv[5]);
+				xfree(resp_msg);
+				break;
+			}
 			error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
 			      prog_name, argv[1], argv[2], argv[3],
 			      argv[4], argv[5], status, resp_msg);
-			rc = -1;
+			if (resp_msg && strstr(resp_msg, "Could not lookup") &&
+			    (retry < capmc_retries)) {
+				/* State Manager is down. Sleep and retry */
+				sleep(1);
+				xfree(resp_msg);
+			} else {
+				/* Fatal error or retries exhausted */
+				rc = -1;
+				xfree(resp_msg);
+				break;
+			}
+		}
-		xfree(resp_msg);
 	}
 
 	/* Request node restart.
@@ -381,17 +409,29 @@ static int _update_all_nodes(char *host_list)
 		argv[4] = NULL;
 //		argv[4] = "-r";	/* Future option: Reason */
 //		argv[5] = "Change KNL mode";
-		resp_msg = _run_script(argv, &status);
-		if ((status == 0) ||
-		    (resp_msg && strcasestr(resp_msg, "Success"))) {
-			debug("%s: node_reinit sent to %s",
-			      prog_name, argv[3]);
-		} else {
+		for (retry = 0; ; retry++) {
+			resp_msg = _run_script(argv, &status);
+			if ((status == 0) ||
+			    (resp_msg && strcasestr(resp_msg, "Success"))) {
+				debug("%s: node_reinit sent to %s",
+				      prog_name, argv[3]);
+				xfree(resp_msg);
+				break;
+			}
 			error("%s: capmc(%s,%s,%s): %d %s", prog_name,
 			      argv[1], argv[2], argv[3], status, resp_msg);
-			rc = -1;
+			if (resp_msg && strstr(resp_msg, "Could not lookup") &&
+			    (retry < capmc_retries)) {
+				/* State Manager is down. Sleep and retry */
+				sleep(1);
+				xfree(resp_msg);
+			} else {
+				/* Fatal error or retries exhausted */
+				rc = -1;
+				xfree(resp_msg);
+				break;
+			}
+		}
-		xfree(resp_msg);
 	}
 
 	xfree(nid_list);
@@ -483,7 +523,9 @@ int main(int argc, char *argv[])
 	log_opts.stderr_level = LOG_LEVEL_QUIET;
 	log_opts.syslog_level = LOG_LEVEL_QUIET;
 	if (slurm_get_debug_flags() && DEBUG_FLAG_NODE_FEATURES)
-		log_opts.logfile_level += 3;
+		log_opts.logfile_level = LOG_LEVEL_DEBUG;
+	else
+		log_opts.logfile_level = LOG_LEVEL_ERROR;
 	(void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);
 
 	if ((argc < 2) || (argc > 3)) {
@@ -521,13 +563,13 @@ int main(int argc, char *argv[])
 	node_bitmap = bit_alloc(100000);
 	if (_update_all_nodes(argv[1]) != 0) {
 		/* Could not reboot nodes.
-		 * Requeue/hold the job we were trying to start */
+		 * Requeue the job we were trying to start */
 		uint32_t job_id = 0;
 		char *job_id_str = getenv("SLURM_JOB_ID");
 		if (job_id_str)
 			job_id = strtol(job_id_str, NULL, 10);
 		if (job_id)
-			(void) slurm_requeue(job_id, JOB_REQUEUE_HOLD);
+			(void) slurm_requeue(job_id, JOB_RECONFIG_FAIL);
 
 		/* Return the nodes to service */
 		slurm_init_update_node_msg(&node_msg);
diff --git a/contribs/cray/capmc_suspend.c b/contribs/cray/capmc_suspend.c
index 8ee99e680db..1091275d8a8 100644
--- a/contribs/cray/capmc_suspend.c
+++ b/contribs/cray/capmc_suspend.c
@@ -71,6 +71,7 @@
 #define MAX_POLL_WAIT 500
 
 /* Default and minimum timeout parameters for the capmc command */
+#define DEFAULT_CAPMC_RETRIES 4
 #define DEFAULT_CAPMC_TIMEOUT 60000	/* 60 seconds */
 #define MIN_CAPMC_TIMEOUT 1000		/* 1 second */
 
@@ -83,14 +84,11 @@
 /* Static variables */
 static char *capmc_path = NULL;
 static uint32_t capmc_poll_freq = 45;   /* capmc state polling frequency */
+static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
 static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
 static char *log_file = NULL;
 static char *prog_name = NULL;
 
-static pthread_mutex_t thread_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t  thread_cnt_cond  = PTHREAD_COND_INITIALIZER;
-static int thread_cnt = 0;
-
 /* NOTE: Keep this table synchronized with the table in
  * src/plugins/node_features/knl_cray/node_features_knl_cray.c */
 static s_p_options_t knl_conf_file_options[] = {
@@ -99,6 +97,7 @@ static s_p_options_t knl_conf_file_options[] = {
 	{"AllowUserBoot", S_P_STRING},
 	{"CapmcPath", S_P_STRING},
 	{"CapmcPollFreq", S_P_UINT32},
+	{"CapmcRetries", S_P_UINT32},
 	{"CapmcTimeout", S_P_UINT32},
 	{"CnselectPath", S_P_STRING},
 	{"DefaultMCDRAM", S_P_STRING},
@@ -146,6 +145,7 @@ static void _read_config(void)
 	if ((tbl = _config_make_tbl(knl_conf_file))) {
 		(void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
 		(void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
+		(void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
 		(void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
 		(void) s_p_get_string(&log_file, "LogFile", tbl);
 	}
@@ -321,7 +321,7 @@ static char *_node_names_2_nid_list(char *node_names)
 static int _update_all_nodes(char *node_names)
 {
 	char *argv[10], *nid_list, *resp_msg;
-	int rc = -1, status = 0;
+	int rc = 0, retry, status = 0;
 
 	nid_list = _node_names_2_nid_list(node_names);
 	if (nid_list == NULL)
@@ -334,15 +334,31 @@ static int _update_all_nodes(char *node_names)
 	argv[2] = "-n";
 	argv[3] = nid_list;
 	argv[4] = NULL;
-	resp_msg = _run_script(argv, &status);
-	if ((status == 0) ||
-	    (resp_msg && strcasestr(resp_msg, "Success"))) {
-		debug("%s: node_off sent to %s", prog_name, argv[3]);
-		rc = 0;
-	} else {
+	for (retry = 0; ; retry++) {
+		resp_msg = _run_script(argv, &status);
+		if ((status == 0) ||
+		    (resp_msg && strcasestr(resp_msg, "Success"))) {
+			debug("%s: node_off sent to %s", prog_name, argv[3]);
+			xfree(resp_msg);
+			break;
+		}
 		error("%s: capmc(%s,%s,%s): %d %s", prog_name,
 		      argv[1], argv[2], argv[3], status, resp_msg);
+		if (resp_msg && strstr(resp_msg, "Could not lookup") &&
+		    (retry < capmc_retries)) {
+			/* State Manager is down. Sleep and retry */
+			error("Cray State Manager is down, retrying request");
+			sleep(1);
+			xfree(resp_msg);
+		} else {
+			/* Fatal error or retries exhausted */
+			error("Aborting capmc_suspend for %s", nid_list);
+			rc = -1;
+			xfree(resp_msg);
+			break;
+		}
+	}
+
 	xfree(resp_msg);
 	xfree(nid_list);
 	return rc;
@@ -357,23 +373,15 @@ int main(int argc, char *argv[])
 	log_opts.stderr_level = LOG_LEVEL_QUIET;
 	log_opts.syslog_level = LOG_LEVEL_QUIET;
 	if (slurm_get_debug_flags() && DEBUG_FLAG_NODE_FEATURES)
-		log_opts.logfile_level += 3;
+		log_opts.logfile_level = LOG_LEVEL_DEBUG;
+	else
+		log_opts.logfile_level = LOG_LEVEL_ERROR;
 	(void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);
 
 	/* Attempt to shutdown all nodes in a single capmc call. */
 	if (_update_all_nodes(argv[1]) != 0)
 		exit(1);
 
-	/* Wait for work threads to complete */
-	slurm_mutex_lock(&thread_cnt_mutex);
-	while (1) {
-		if (thread_cnt == 0)
-			break;
-		else	/* wait for state change and retry */
-			pthread_cond_wait(&thread_cnt_cond, &thread_cnt_mutex);
-	}
-	slurm_mutex_unlock(&thread_cnt_mutex);
-
 	xfree(prog_name);
 	exit(0);
 }
diff --git a/doc/man/man5/knl.conf.5 b/doc/man/man5/knl.conf.5
index bc32ca9853b..5489bb8e564 100644
--- a/doc/man/man5/knl.conf.5
+++ b/doc/man/man5/knl.conf.5
@@ -92,6 +92,11 @@ changes, in seconds.
 The default value is 45 seconds.
 This parameter is used only by the "knl_cray" plugin.
 
+.TP
+\fBCapmcRetries\fR
+Number of times to retry a \fBcapmc\fR operation that fails because the Cray
+State Manager is down. The default value is 4.
+
 .TP
 \fBCapmcTimeout\fR
 Time limit for the \fBcapmc\fR program to return status information milliseconds.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 548d205e8f9..8126c9fe77b 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -240,6 +240,9 @@ enum job_states {
 #define	JOB_COMPLETING	  0x00008000 /* Waiting for epilog completion */
 #define JOB_STOPPED       0x00010000 /* Job is stopped state (holding resources,
 					but sent SIGSTOP */
+#define JOB_RECONFIG_FAIL 0x00020000 /* Node configuration for job failed,
+					not job state, just job requeue flag */
+
 
 #define READY_JOB_FATAL	   -2	/* fatal error */
 #define READY_JOB_ERROR    -1	/* ordinary error */
@@ -4461,9 +4464,10 @@ extern void slurm_free_job_array_resp(job_array_resp_msg_t *resp);
  *            valid values are:
  *            0 - if the job has to be requeued in JOB_PENDING state
  *            JOB_SPECIAL_EXIT - if the job has to be requeued in
- *            the special exit state and be held.
+ *                the special exit state and be held.
  *            JOB_REQUEUE_HOLD - if the job has to be requeued in
- *            JOB_PENDING and held state.
+ *                JOB_PENDING and held state.
+ *            JOB_RECONFIG_FAIL - Node configuration for job failed
  * RET 0 or a slurm error code
  */
 extern int slurm_requeue(uint32_t job_id, uint32_t state);
@@ -4477,9 +4481,10 @@ extern int slurm_requeue(uint32_t job_id, uint32_t state);
  *            valid values are:
  *            0 - if the job has to be requeued in JOB_PENDING state
  *            JOB_SPECIAL_EXIT - if the job has to be requeued in
- *            the special exit state and be held.
+ *                the special exit state and be held.
  *            JOB_REQUEUE_HOLD - if the job has to be requeued in
- *            JOB_PENDING and held state.
+ *                JOB_PENDING and held state.
+ *            JOB_RECONFIG_FAIL - Node configuration for job failed
  * OUT resp - per task response to the request,
  *	      free using slurm_free_job_array_resp()
  * RET 0 or a slurm error code
diff --git a/src/plugins/node_features/knl_cray/node_features_knl_cray.c b/src/plugins/node_features/knl_cray/node_features_knl_cray.c
index 956ba8b581a..289758f9439 100644
--- a/src/plugins/node_features/knl_cray/node_features_knl_cray.c
+++ b/src/plugins/node_features/knl_cray/node_features_knl_cray.c
@@ -84,6 +84,7 @@
 #define MAX_POLL_WAIT 500
 
 /* Default and minimum timeout parameters for the capmc command */
+#define DEFAULT_CAPMC_RETRIES 4
 #define DEFAULT_CAPMC_TIMEOUT 60000	/* 60 seconds */
 #define MIN_CAPMC_TIMEOUT 1000		/* 1 second */
 
@@ -154,6 +155,7 @@ List active_feature_list;
 /* Configuration Paramters */
 static char *capmc_path = NULL;
 static uint32_t capmc_poll_freq = 45;	/* capmc state polling frequency */
+static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
 static uint32_t capmc_timeout = 0;	/* capmc command timeout in msec */
 static char *cnselect_path = NULL;
 static bool  debug_flag = false;
@@ -180,6 +182,7 @@ static s_p_options_t knl_conf_file_options[] = {
 	{"AllowUserBoot", S_P_STRING},
 	{"CapmcPath", S_P_STRING},
 	{"CapmcPollFreq", S_P_UINT32},
+	{"CapmcRetries", S_P_UINT32},
 	{"CapmcTimeout", S_P_UINT32},
 	{"CnselectPath", S_P_STRING},
 	{"DefaultMCDRAM", S_P_STRING},
@@ -1522,6 +1525,7 @@ extern int init(void)
 		}
 		(void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
 		(void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
+		(void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
 		(void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
 		(void) s_p_get_string(&cnselect_path, "CnselectPath", tbl);
 		if (s_p_get_string(&tmp_str, "DefaultMCDRAM", tbl)) {
@@ -1568,6 +1572,7 @@ extern int init(void)
 		info("AllowUserBoot=%s", allow_user_str);
 		info("CapmcPath=%s", capmc_path);
 		info("CapmcPollFreq=%u sec", capmc_poll_freq);
+		info("CapmcRetries=%u", capmc_retries);
 		info("CapmcTimeout=%u msec", capmc_timeout);
 		info("CnselectPath=%s", cnselect_path);
 		info("DefaultMCDRAM=%s DefaultNUMA=%s",
@@ -1613,7 +1618,7 @@ extern int node_features_p_get_node(char *node_list)
 {
 	json_object *j;
 	json_object_iter iter;
-	int i, k, status = 0, rc = SLURM_SUCCESS;
+	int i, k, rc = SLURM_SUCCESS, retry, status = 0;
 	DEF_TIMERS;
 	char *resp_msg, **script_argv;
 	mcdram_cap_t *mcdram_cap = NULL;
@@ -1643,25 +1648,39 @@ extern int node_features_p_get_node(char *node_list)
 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
 	script_argv[0] = xstrdup("capmc");
 	script_argv[1] = xstrdup("get_mcdram_capabilities");
-	START_TIMER;
-	resp_msg = _run_script(capmc_path, script_argv, &status);
-	END_TIMER;
-	if (debug_flag) {
-		info("%s: get_mcdram_capabilities ran for %s",
-		     __func__, TIME_STR);
-	}
-	_log_script_argv(script_argv, resp_msg);
-	_free_script_argv(script_argv);
-	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
+	for (retry = 0; ; retry++) {
+		START_TIMER;
+		resp_msg = _run_script(capmc_path, script_argv, &status);
+		END_TIMER;
+		if (debug_flag) {
+			info("%s: get_mcdram_capabilities ran for %s",
+			     __func__, TIME_STR);
+		}
+		_log_script_argv(script_argv, resp_msg);
+		if (resp_msg && WIFEXITED(status) && !WEXITSTATUS(status))
+			break;	/* Success, with a response to parse */
 		error("%s: get_mcdram_capabilities status:%u response:%s",
 		      __func__, status, resp_msg);
+		if (resp_msg == NULL) {
+			info("%s: get_mcdram_capabilities returned no information",
+			     __func__);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+		if (strstr(resp_msg, "Could not lookup") &&
+		    (retry < capmc_retries)) {
+			/* State Manager is down. Sleep and retry */
+			sleep(1);
+			xfree(resp_msg);
+		} else {
+			xfree(resp_msg);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+	}
-	if (resp_msg == NULL) {
-		info("%s: get_mcdram_capabilities returned no information",
-		     __func__);
-		rc = SLURM_ERROR;
-		goto fini;
-	}
+	_free_script_argv(script_argv);
 
 	j = json_tokener_parse(resp_msg);
 	if (j == NULL) {
@@ -1686,22 +1705,39 @@ extern int node_features_p_get_node(char *node_list)
 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
 	script_argv[0] = xstrdup("capmc");
 	script_argv[1] = xstrdup("get_mcdram_cfg");
-	START_TIMER;
-	resp_msg = _run_script(capmc_path, script_argv, &status);
-	END_TIMER;
-	if (debug_flag)
-		info("%s: get_mcdram_cfg ran for %s", __func__, TIME_STR);
-	_log_script_argv(script_argv, resp_msg);
-	_free_script_argv(script_argv);
-	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
+	for (retry = 0; ; retry++) {
+		START_TIMER;
+		resp_msg = _run_script(capmc_path, script_argv, &status);
+		END_TIMER;
+		if (debug_flag) {
+			info("%s: get_mcdram_cfg ran for %s",
+			     __func__, TIME_STR);
+		}
+		_log_script_argv(script_argv, resp_msg);
+		if (resp_msg && WIFEXITED(status) && !WEXITSTATUS(status))
+			break;	/* Success, with a response to parse */
 		error("%s: get_mcdram_cfg status:%u response:%s",
 		      __func__, status, resp_msg);
+		if (resp_msg == NULL) {
+			info("%s: get_mcdram_cfg returned no information",
+			     __func__);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+		if (strstr(resp_msg, "Could not lookup") &&
+		    (retry < capmc_retries)) {
+			/* State Manager is down. Sleep and retry */
+			sleep(1);
+			xfree(resp_msg);
+		} else {
+			xfree(resp_msg);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+	}
-	if (resp_msg == NULL) {
-		info("%s: get_mcdram_cfg returned no information", __func__);
-		rc = SLURM_ERROR;
-		goto fini;
-	}
+	_free_script_argv(script_argv);
 
 	j = json_tokener_parse(resp_msg);
 	if (j == NULL) {
@@ -1728,25 +1764,39 @@ extern int node_features_p_get_node(char *node_list)
 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
 	script_argv[0] = xstrdup("capmc");
 	script_argv[1] = xstrdup("get_numa_capabilities");
-	START_TIMER;
-	resp_msg = _run_script(capmc_path, script_argv, &status);
-	END_TIMER;
-	if (debug_flag) {
-		info("%s: get_numa_capabilities ran for %s",
-		     __func__, TIME_STR);
-	}
-	_log_script_argv(script_argv, resp_msg);
-	_free_script_argv(script_argv);
-	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
+	for (retry = 0; ; retry++) {
+		START_TIMER;
+		resp_msg = _run_script(capmc_path, script_argv, &status);
+		END_TIMER;
+		if (debug_flag) {
+			info("%s: get_numa_capabilities ran for %s",
+			     __func__, TIME_STR);
+		}
+		_log_script_argv(script_argv, resp_msg);
+		if (resp_msg && WIFEXITED(status) && !WEXITSTATUS(status))
+			break;	/* Success, with a response to parse */
 		error("%s: get_numa_capabilities status:%u response:%s",
 		      __func__, status, resp_msg);
+		if (resp_msg == NULL) {
+			info("%s: get_numa_capabilities returned no information",
+			     __func__);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+		if (strstr(resp_msg, "Could not lookup") &&
+		    (retry < capmc_retries)) {
+			/* State Manager is down. Sleep and retry */
+			sleep(1);
+			xfree(resp_msg);
+		} else {
+			xfree(resp_msg);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+	}
-	if (resp_msg == NULL) {
-		info("%s: get_numa_capabilities returned no information",
-		     __func__);
-		rc = SLURM_ERROR;
-		goto fini;
-	}
+	_free_script_argv(script_argv);
 
 	j = json_tokener_parse(resp_msg);
 	if (j == NULL) {
@@ -1771,22 +1821,37 @@ extern int node_features_p_get_node(char *node_list)
 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
 	script_argv[0] = xstrdup("capmc");
 	script_argv[1] = xstrdup("get_numa_cfg");
-	START_TIMER;
-	resp_msg = _run_script(capmc_path, script_argv, &status);
-	END_TIMER;
-	if (debug_flag)
-		info("%s: get_numa_cfg ran for %s", __func__, TIME_STR);
-	_log_script_argv(script_argv, resp_msg);
-	_free_script_argv(script_argv);
-	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
+	for (retry = 0; ; retry++) {
+		START_TIMER;
+		resp_msg = _run_script(capmc_path, script_argv, &status);
+		END_TIMER;
+		if (debug_flag)
+			info("%s: get_numa_cfg ran for %s", __func__, TIME_STR);
+		_log_script_argv(script_argv, resp_msg);
+		if (resp_msg && WIFEXITED(status) && !WEXITSTATUS(status))
+			break;	/* Success, with a response to parse */
 		error("%s: get_numa_cfg status:%u response:%s",
 		      __func__, status, resp_msg);
+		if (resp_msg == NULL) {
+			info("%s: get_numa_cfg returned no information",
+			     __func__);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+		if (strstr(resp_msg, "Could not lookup") &&
+		    (retry < capmc_retries)) {
+			/* State Manager is down. Sleep and retry */
+			sleep(1);
+			xfree(resp_msg);
+		} else {
+			xfree(resp_msg);
+			_free_script_argv(script_argv);
+			rc = SLURM_ERROR;
+			goto fini;
+		}
+	}
-	if (resp_msg == NULL) {
-		info("%s: get_numa_cfg returned no information", __func__);
-		rc = SLURM_ERROR;
-		goto fini;
-	}
+	_free_script_argv(script_argv);
 
 	j = json_tokener_parse(resp_msg);
 	if (j == NULL) {
-- 
GitLab