From 7a82a837f220759c9472036482b4f17f24fbae27 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 11 Oct 2016 17:03:11 -0600
Subject: [PATCH] node_features/knl_cray: check capmc node_status more

Execute "capmc node_status" at more frequent intervals to handle
  nodes being added to or removed from the system using Cray tools
  (i.e., try to keep Slurm and Cray software better synchronized).
---
 .../knl_cray/node_features_knl_cray.c         | 129 +++++++++---------
 1 file changed, 62 insertions(+), 67 deletions(-)

diff --git a/src/plugins/node_features/knl_cray/node_features_knl_cray.c b/src/plugins/node_features/knl_cray/node_features_knl_cray.c
index 972f0b76d54..85fb202c0cc 100644
--- a/src/plugins/node_features/knl_cray/node_features_knl_cray.c
+++ b/src/plugins/node_features/knl_cray/node_features_knl_cray.c
@@ -1725,83 +1725,78 @@ static void _check_node_status(void)
 	struct node_record *node_ptr;
 	DEF_TIMERS;
 
-	if (!capmc_node_bitmap) {
-		script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
-		script_argv[0] = xstrdup("capmc");
-		script_argv[1] = xstrdup("node_status");
-		for (retry = 0; ; retry++) {
-			START_TIMER;
-			resp_msg = _run_script(capmc_path, script_argv, &status);
-			END_TIMER;
-			if (debug_flag) {
-				info("%s: node_status ran for %s",
-				     __func__, TIME_STR);
-			}
-			_log_script_argv(script_argv, resp_msg);
-			if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
-				break;	/* Success */
-			error("%s: node_status status:%u response:%s",
-			      __func__, status, resp_msg);
-			if (resp_msg == NULL) {
-				info("%s: node_status returned no information",
-				     __func__);
-				_free_script_argv(script_argv);
-				return;
-			}
-			if (strstr(resp_msg, "Could not lookup") &&
-			    (retry <= capmc_retries)) {
-				/* State Manager is down. Sleep and retry */
-				sleep(1);
-				xfree(resp_msg);
-			} else {
-				xfree(resp_msg);
-				_free_script_argv(script_argv);
-				return;
-			}
+	script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
+	script_argv[0] = xstrdup("capmc");
+	script_argv[1] = xstrdup("node_status");
+	for (retry = 0; ; retry++) {
+		START_TIMER;
+		resp_msg = _run_script(capmc_path, script_argv, &status);
+		END_TIMER;
+		if (debug_flag)
+			info("%s: node_status ran for %s", __func__, TIME_STR);
+		_log_script_argv(script_argv, resp_msg);
+		if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
+			break;	/* Success */
+		error("%s: node_status status:%u response:%s",
+		      __func__, status, resp_msg);
+		if (resp_msg == NULL) {
+			info("%s: node_status returned no information",
+			     __func__);
+			_free_script_argv(script_argv);
+			return;
 		}
-		_free_script_argv(script_argv);
-
-		j_obj = json_tokener_parse(resp_msg);
-		if (j_obj == NULL) {
-			error("%s: json parser failed on %s", __func__,
-			      resp_msg);
+		if (strstr(resp_msg, "Could not lookup") &&
+		    (retry <= capmc_retries)) {
+			/* State Manager is down. Sleep and retry */
+			sleep(1);
+			xfree(resp_msg);
+		} else {
 			xfree(resp_msg);
+			_free_script_argv(script_argv);
 			return;
 		}
+	}
+	_free_script_argv(script_argv);
 
-		capmc_node_bitmap = bit_alloc(100000);
-		json_object_object_foreachC(j_obj, iter) {
-			/* NOTE: The error number "e" and message "err_msg"
-			 * fields are currently ignored. */
-			if (!xstrcmp(iter.key, "e") ||
-			    !xstrcmp(iter.key, "err_msg"))
-				continue;
-			if (json_object_get_type(iter.val) != json_type_array)
-				continue;
-			json_object_object_get_ex(j_obj, iter.key, &j_array);
-			if (!j_array) {
+	j_obj = json_tokener_parse(resp_msg);
+	if (j_obj == NULL) {
+		error("%s: json parser failed on %s", __func__, resp_msg);
+		xfree(resp_msg);
+		return;
+	}
+
+	FREE_NULL_BITMAP(capmc_node_bitmap);
+	capmc_node_bitmap = bit_alloc(100000);
+	json_object_object_foreachC(j_obj, iter) {
+		/* NOTE: The error number "e" and message "err_msg"
+		 * fields are currently ignored. */
+		if (!xstrcmp(iter.key, "e") ||
+		    !xstrcmp(iter.key, "err_msg"))
+			continue;
+		if (json_object_get_type(iter.val) != json_type_array)
+			continue;
+		json_object_object_get_ex(j_obj, iter.key, &j_array);
+		if (!j_array) {
+			error("%s: Unable to parse nid specification",
+			      __func__);
+			FREE_NULL_BITMAP(capmc_node_bitmap);
+			return;
+		}
+		num_ent = json_object_array_length(j_array);
+		for (i = 0; i < num_ent; i++) {
+			j_value = json_object_array_get_idx(j_array, i);
+			if (json_object_get_type(j_value) !=
+			    json_type_int) {
 				error("%s: Unable to parse nid specification",
 				      __func__);
-				FREE_NULL_BITMAP(capmc_node_bitmap);
-				return;
-			}
-			num_ent = json_object_array_length(j_array);
-			for (i = 0; i < num_ent; i++) {
-				j_value = json_object_array_get_idx(j_array, i);
-				if (json_object_get_type(j_value) !=
-				    json_type_int) {
-					error("%s: Unable to parse nid specification",
-					      __func__);
-				} else {
-					nid = json_object_get_int64(j_value);
-					if ((nid >= 0) && (nid < 100000))
-						bit_set(capmc_node_bitmap, nid);
-				}
-
+			} else {
+				nid = json_object_get_int64(j_value);
+				if ((nid >= 0) && (nid < 100000))
+					bit_set(capmc_node_bitmap, nid);
 			}
 		}
-		json_object_put(j_obj);	/* Frees json memory */
 	}
+	json_object_put(j_obj);	/* Frees json memory */
 
 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
 	     i++, node_ptr++) {
-- 
GitLab