From 5b61a4648b8d1de5b7a1c01f7cc32d01dbbdc999 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 4 May 2009 22:23:45 +0000
Subject: [PATCH] Avoid using powered down nodes when scheduling work if
 possible. Fix possible invalid memory reference in power save logic.

---
 NEWS                           |  6 ++--
 src/slurmctld/agent.c          |  3 +-
 src/slurmctld/node_mgr.c       |  2 ++
 src/slurmctld/node_scheduler.c | 53 ++++++++++++++++++++++++++++++++--
 src/slurmctld/power_save.c     |  2 ++
 src/slurmctld/read_config.c    | 24 ++++++++-------
 src/slurmctld/slurmctld.h      |  2 ++
 7 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/NEWS b/NEWS
index 050a9baf0ee..6b48df4d989 100644
--- a/NEWS
+++ b/NEWS
@@ -18,9 +18,11 @@ documents those changes that are of interest to users and admins.
     compute node).
  -- Add slurmctld and slurmd binding to appropriate communications address
     based upon NodeAddr, ControllerAddr and BackupAddr configuration 
-    parameters. Patch from Matthieu Hautreux, CEA.
-    NOTE: Failing on Debian Linux. 
+    parameters. Based upon patch from Matthieu Hautreux, CEA.
+    NOTE: Fails with some SlurmDBD configurations.
     NOTE: You must define BIND_SPECIFIC_ADDR to enable this option.
+ -- Avoid using powered down nodes when scheduling work if possible. 
+    Fix possible invalid memory reference in power save logic.
 
 * Changes in SLURM 1.4.0-pre13
 ==============================
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 52572bb5384..7c9fdad50ee 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -1497,7 +1497,8 @@ static bool _batch_launch_defer(queued_request_t *queued_req_ptr)
 	} else if (difftime(now, queued_req_ptr->first_attempt) >= 
 				slurm_get_batch_start_timeout()) {
 		error("agent waited too long for node %s to come up, "
-		      "sending batch request anyway...");
+		      "sending batch request anyway...", 
+		      node_ptr->name);
 		queued_req_ptr->last_attempt = (time_t) 0;
 		return false;
 	}
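
The agent.c change above fixes a printf-style format bug: the original error()
call contains a %s conversion but passes no matching argument, so the logger
reads an indeterminate pointer off the variadic argument area, a possible
invalid memory reference. A minimal, self-contained sketch of the bug class
and the fix; log_error() is a hypothetical stand-in, not SLURM's error():

#include <stdio.h>

/* Hypothetical printf-style logger standing in for slurmctld's error(). */
static void log_error(const char *fmt, const char *arg)
{
	printf(fmt, arg);
	printf("\n");
}

int main(void)
{
	const char *node_name = "tux123";

	/* Pre-patch pattern: a "%s" in the format but no argument, so
	 * the logger dereferences whatever sits in the variadic slot.
	 * Post-patch pattern, shown here: pass the node name so the
	 * "%s" conversion has a real string to read. */
	log_error("agent waited too long for node %s to come up, "
		  "sending batch request anyway...", node_name);
	return 0;
}

Annotating a variadic logger with __attribute__((format(printf, 1, 2))) lets
the compiler flag this kind of format/argument mismatch at build time.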
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 197c64a3968..ad9e15ccd2d 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -88,6 +88,7 @@ time_t last_node_update = (time_t) NULL;	/* time of last update to
 						 * node records */
 bitstr_t *avail_node_bitmap = NULL;	/* bitmap of available nodes */
 bitstr_t *idle_node_bitmap  = NULL;	/* bitmap of idle nodes */
+bitstr_t *power_node_bitmap = NULL;	/* bitmap of powered down nodes */
 bitstr_t *share_node_bitmap = NULL;  	/* bitmap of sharable nodes */
 bitstr_t *up_node_bitmap    = NULL;  	/* bitmap of non-down nodes */
 
@@ -2640,6 +2641,7 @@ void node_fini(void)
 
 	FREE_NULL_BITMAP(idle_node_bitmap);
 	FREE_NULL_BITMAP(avail_node_bitmap);
+	FREE_NULL_BITMAP(power_node_bitmap);
 	FREE_NULL_BITMAP(share_node_bitmap);
 	FREE_NULL_BITMAP(up_node_bitmap);
 
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 8cf681a6a81..740866719ee 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1241,17 +1241,18 @@ static int _build_node_list(struct job_record *job_ptr,
 			    struct node_set **node_set_pptr,
 			    int *node_set_size)
 {
-	int node_set_inx, rc;
+	int i, node_set_inx, power_cnt, rc;
 	struct node_set *node_set_ptr;
 	struct config_record *config_ptr;
 	struct part_record *part_ptr = job_ptr->part_ptr;
 	ListIterator config_iterator;
 	int check_node_config, config_filter = 0;
 	struct job_details *detail_ptr = job_ptr->details;
-	bitstr_t *usable_node_mask = NULL;
+	bitstr_t *power_up_bitmap = NULL, *usable_node_mask = NULL;
 	multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;
 	bitstr_t *tmp_feature;
 	time_t when = time(NULL);
+	uint32_t max_weight = 0;
 
 	if (job_ptr->resv_name) {
 		/* Limit node selection to those in selected reservation */
@@ -1359,9 +1360,10 @@ static int _build_node_list(struct job_record *job_ptr,
 		node_set_ptr[node_set_inx].cpus_per_node =
 			config_ptr->cpus;
 		node_set_ptr[node_set_inx].real_memory =
-			config_ptr->real_memory;		
+			config_ptr->real_memory;
 		node_set_ptr[node_set_inx].weight =
 			config_ptr->weight;
+		max_weight = MAX(max_weight, config_ptr->weight);
 		node_set_ptr[node_set_inx].features = 
 			xstrdup(config_ptr->feature);
 		node_set_ptr[node_set_inx].feature_array = 
@@ -1389,6 +1391,51 @@ static int _build_node_list(struct job_record *job_ptr,
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
 	}
 
+	/* If any nodes are powered down, put them into a new node_set
+	 * record with a higher scheduling weight (avoids using powered
+	 * down nodes where possible). */
+	for (i = (node_set_inx-1); i >= 0; i--) {
+		power_cnt = bit_overlap(node_set_ptr[i].my_bitmap,
+				        power_node_bitmap);
+		if (power_cnt == 0)
+			continue;	/* no nodes powered down */
+		if (power_cnt == node_set_ptr[i].nodes) {
+			node_set_ptr[i].weight += max_weight;
+			continue;	/* all nodes powered down */
+		}
+
+		/* Some nodes powered down, others up, split record */
+		node_set_ptr[node_set_inx].cpus_per_node =
+			node_set_ptr[i].cpus_per_node;
+		node_set_ptr[node_set_inx].real_memory =
+			node_set_ptr[i].real_memory;
+		node_set_ptr[node_set_inx].nodes = power_cnt;
+		node_set_ptr[i].nodes -= power_cnt;
+		node_set_ptr[node_set_inx].weight =
+			node_set_ptr[i].weight + max_weight;
+		node_set_ptr[node_set_inx].features =
+			xstrdup(node_set_ptr[i].features);
+		node_set_ptr[node_set_inx].feature_array =
+			node_set_ptr[i].feature_array;
+		node_set_ptr[node_set_inx].feature_bits =
+			bit_copy(node_set_ptr[i].feature_bits);
+		node_set_ptr[node_set_inx].my_bitmap = 
+			bit_copy(node_set_ptr[i].my_bitmap);
+		bit_and(node_set_ptr[node_set_inx].my_bitmap,
+			power_node_bitmap);
+		if (power_up_bitmap == NULL) {
+			power_up_bitmap = bit_copy(power_node_bitmap);
+			bit_not(power_up_bitmap);
+		}
+		bit_and(node_set_ptr[i].my_bitmap, power_up_bitmap);
+
+		node_set_inx++;
+		xrealloc(node_set_ptr,
+			 sizeof(struct node_set) * (node_set_inx + 2));
+		node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
+	}
+	FREE_NULL_BITMAP(power_up_bitmap);
+
 	*node_set_size = node_set_inx;
 	*node_set_pptr = node_set_ptr;
 	return SLURM_SUCCESS;
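
The new block in _build_node_list() biases scheduling away from powered-down
nodes purely by weight manipulation: a set made up entirely of powered-down
nodes simply has its weight raised by max_weight, while a mixed set is split
so that only the powered-down half pays the penalty. The toy program below
reproduces that logic with plain arrays in place of SLURM's bitstr_t and
struct node_set; every name in it is illustrative, and it assumes (as SLURM's
node selection does) that lower-weight sets are preferred:

#include <stdio.h>
#include <string.h>

#define MAX_NODES 8
#define MAX_SETS  8

struct toy_node_set {
	unsigned char member[MAX_NODES];	/* stand-in node bitmap */
	int node_cnt;
	int weight;
};

/* Mirror of the patch: bump whole powered-down sets, split mixed ones. */
static int split_powered_down(struct toy_node_set *set, int set_cnt,
			      const unsigned char *powered_down,
			      int max_weight)
{
	for (int i = set_cnt - 1; i >= 0; i--) {
		int down = 0;
		for (int n = 0; n < MAX_NODES; n++) {
			if (set[i].member[n] && powered_down[n])
				down++;
		}
		if (down == 0)
			continue;		/* no nodes powered down */
		if (down == set[i].node_cnt) {
			set[i].weight += max_weight;
			continue;		/* all nodes powered down */
		}
		/* Mixed set: move powered-down nodes into a new set that
		 * carries the weight penalty (capacity assumed adequate
		 * for this example). */
		struct toy_node_set *ns = &set[set_cnt++];
		memset(ns, 0, sizeof(*ns));
		ns->weight = set[i].weight + max_weight;
		for (int n = 0; n < MAX_NODES; n++) {
			if (set[i].member[n] && powered_down[n]) {
				set[i].member[n] = 0;
				ns->member[n] = 1;
			}
		}
		ns->node_cnt = down;
		set[i].node_cnt -= down;
	}
	return set_cnt;
}

int main(void)
{
	struct toy_node_set set[MAX_SETS] = {
		{ .member = {1, 1, 1, 1}, .node_cnt = 4, .weight = 10 },
	};
	unsigned char powered_down[MAX_NODES] = {0, 1, 1, 0};
	int cnt = split_powered_down(set, 1, powered_down, 10);

	for (int i = 0; i < cnt; i++)
		printf("set %d: %d nodes, weight %d\n",
		       i, set[i].node_cnt, set[i].weight);
	return 0;
}

With lower weight preferred, selection exhausts the weight-10 (already up)
nodes before touching the weight-20 (powered-down) ones, which is exactly the
behavior the NEWS entry describes.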
diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c
index 80465350fad..17d2af7146a 100644
--- a/src/slurmctld/power_save.c
+++ b/src/slurmctld/power_save.c
@@ -119,6 +119,7 @@ static void _do_power_work(void)
 			wake_cnt++;
 			resume_cnt++;
 			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
+			bit_clear(power_node_bitmap, i);
 			node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
 			node_ptr->last_response = now;
 			bit_set(wake_node_bitmap, i);
@@ -137,6 +138,7 @@ static void _do_power_work(void)
 			sleep_cnt++;
 			suspend_cnt++;
 			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
+			bit_set(power_node_bitmap, i);
 			bit_set(sleep_node_bitmap, i);
 		}
 	}
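
The two added lines keep power_node_bitmap synchronized with the
NODE_STATE_POWER_SAVE flag at the moment of each transition: the bit is
cleared when a node is woken and set when it is suspended. A minimal sketch
of that invariant, using a plain array for the bitmap and an illustrative
flag value rather than SLURM's definitions:

#include <assert.h>
#include <stdio.h>

#define POWER_SAVE_FLAG 0x0100	/* illustrative, not SLURM's value */
#define NODE_CNT 4

static unsigned node_state[NODE_CNT];
static unsigned char power_bit[NODE_CNT];	/* stand-in bitmap */

static void suspend_node(int i)
{
	node_state[i] |= POWER_SAVE_FLAG;
	power_bit[i] = 1;	/* cf. bit_set(power_node_bitmap, i) */
}

static void wake_node(int i)
{
	node_state[i] &= ~POWER_SAVE_FLAG;
	power_bit[i] = 0;	/* cf. bit_clear(power_node_bitmap, i) */
}

int main(void)
{
	suspend_node(2);
	wake_node(2);
	suspend_node(3);

	/* Invariant the scheduler relies on: bit set iff flag set. */
	for (int i = 0; i < NODE_CNT; i++)
		assert(((node_state[i] & POWER_SAVE_FLAG) != 0) ==
		       (power_bit[i] != 0));
	printf("power bitmap consistent with node_state flags\n");
	return 0;
}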
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 1d6cc344c73..f97757648a6 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -183,14 +183,17 @@ static int _build_bitmaps(void)
 	/* initialize the idle and up bitmaps */
 	FREE_NULL_BITMAP(idle_node_bitmap);
 	FREE_NULL_BITMAP(avail_node_bitmap);
+	FREE_NULL_BITMAP(power_node_bitmap);
 	FREE_NULL_BITMAP(share_node_bitmap);
 	FREE_NULL_BITMAP(up_node_bitmap);
 	idle_node_bitmap  = (bitstr_t *) bit_alloc(node_record_count);
 	avail_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
+	power_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
 	share_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
 	up_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
 	if ((idle_node_bitmap     == NULL) ||
 	    (avail_node_bitmap    == NULL) ||
+	    (power_node_bitmap    == NULL) ||
 	    (share_node_bitmap    == NULL) ||
 	    (up_node_bitmap       == NULL)) 
 		fatal ("bit_alloc malloc failure");
@@ -235,17 +238,15 @@ static int _build_bitmaps(void)
 	 * their configuration, resync DRAINED vs. DRAINING state */
 	for (i = 0; i < node_record_count; i++) {
 		uint16_t base_state, drain_flag, no_resp_flag, job_cnt;
+		struct node_record *node_ptr = node_record_table_ptr + i;
 
-		if (node_record_table_ptr[i].name[0] == '\0')
+		if (node_ptr->name[0] == '\0')
 			continue;	/* defunct */
-		base_state = node_record_table_ptr[i].node_state & 
-				NODE_STATE_BASE;
-		drain_flag = node_record_table_ptr[i].node_state &
+		base_state = node_ptr->node_state & NODE_STATE_BASE;
+		drain_flag = node_ptr->node_state &
 				(NODE_STATE_DRAIN | NODE_STATE_FAIL);
-		no_resp_flag = node_record_table_ptr[i].node_state & 
-				NODE_STATE_NO_RESPOND;
-		job_cnt = node_record_table_ptr[i].run_job_cnt +
-		          node_record_table_ptr[i].comp_job_cnt;
+		no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
+		job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
 
 		if (((base_state == NODE_STATE_IDLE) && (job_cnt == 0))
 		||  (base_state == NODE_STATE_DOWN))
@@ -256,9 +257,10 @@ static int _build_bitmaps(void)
 				bit_set(avail_node_bitmap, i);
 			bit_set(up_node_bitmap, i);
 		}
-		if (node_record_table_ptr[i].config_ptr)
-			bit_set(node_record_table_ptr[i].config_ptr->
-				node_bitmap, i);
+		if (node_ptr->node_state & NODE_STATE_POWER_SAVE)
+			bit_set(power_node_bitmap, i);
+		if (node_ptr->config_ptr)
+			bit_set(node_ptr->config_ptr->node_bitmap, i);
 	}
 	return error_code;
 }
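
While power_save.c updates power_node_bitmap incrementally, _build_bitmaps()
must be able to reconstruct it from scratch, since slurmctld reruns this code
on startup and reconfiguration when the in-memory bitmaps no longer exist. A
compact sketch of that rebuild pass, with illustrative names in place of
SLURM's types:

#include <stdio.h>
#include <stdlib.h>

#define POWER_SAVE_FLAG 0x0100	/* illustrative, not SLURM's value */

int main(void)
{
	/* Per-node state as it might look after a controller restart. */
	unsigned state[] = { 0, POWER_SAVE_FLAG, 0, POWER_SAVE_FLAG };
	int node_cnt = (int)(sizeof(state) / sizeof(state[0]));

	/* Allocate a fresh zeroed bitmap, then derive each bit from the
	 * node's saved state, as _build_bitmaps() does. */
	unsigned char *power_bitmap = calloc((size_t)node_cnt, 1);
	if (power_bitmap == NULL) {
		fprintf(stderr, "calloc failure\n");	/* cf. fatal() */
		return 1;
	}
	for (int i = 0; i < node_cnt; i++) {
		if (state[i] & POWER_SAVE_FLAG)
			power_bitmap[i] = 1;
	}
	for (int i = 0; i < node_cnt; i++)
		printf("node %d: %s\n", i,
		       power_bitmap[i] ? "powered down" : "up");
	free(power_bitmap);
	return 0;
}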
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index c39b9796f4b..bd070ccaea0 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -251,6 +251,7 @@ extern bool ping_nodes_now;		/* if set, ping nodes immediately */
  *  avail_node_bitmap       Set if node's state is not DOWN, DRAINING/DRAINED, 
  *                          FAILING or NO_RESPOND (i.e. available to run a job)
  *  idle_node_bitmap        Set if node has no jobs allocated to it
+ *  power_node_bitmap       Set for nodes which are powered down
  *  share_node_bitmap       Set if any job allocated resources on that node
  *                          is configured to not share the node with other 
  *                          jobs (--exclusive option specified by job or
@@ -260,6 +261,7 @@ extern bool ping_nodes_now;		/* if set, ping nodes immediately */
 extern bitstr_t *avail_node_bitmap;	/* bitmap of available nodes, 
 					 * state not DOWN, DRAIN or FAILING */
 extern bitstr_t *idle_node_bitmap;	/* bitmap of idle nodes */
+extern bitstr_t *power_node_bitmap;	/* Powered down nodes */
 extern bitstr_t *share_node_bitmap;	/* bitmap of sharable nodes */
 extern bitstr_t *up_node_bitmap;	/* bitmap of up nodes, not DOWN */
 
-- 
GitLab