From 06115b75e1e6e724ae5bd263d8519706c6ea92b4 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Tue, 3 Apr 2007 00:52:40 +0000
Subject: [PATCH] Another fix to make sure steps with requested nodes have
 correct cpus accounted for and a fix to make sure the user can't allocate
 more cpus than they have requested.

---
 NEWS                           |  3 +++
 src/common/slurm_step_layout.c | 14 ++------------
 src/slurmctld/step_mgr.c       | 23 ++++++++++++++---------
 src/srun/launch.c              | 12 +++++++++++-
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/NEWS b/NEWS
index 74a667d2c53..7e93af11a81 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,9 @@ documents those changes that are of interest to users and admins.
     but doesn't ever get the message from the sender (network issue or
     something) also check to make sure if we get something back we make sure
     we account for everything we sent out before we call it good.
+ -- Another fix to make sure steps with requested nodes have correct cpus
+    accounted for and a fix to make sure the user can't allocate more
+    cpus than they have requested.
 
 * Changes in SLURM 1.2.3
 ========================
diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c
index 1818bdafe6b..555f7d65fea 100644
--- a/src/common/slurm_step_layout.c
+++ b/src/common/slurm_step_layout.c
@@ -404,8 +404,6 @@ static int _init_task_layout(slurm_step_layout_t *step_layout,
 	
 	step_layout->plane_size = plane_size;
 
-/* 	step_layout->node_addr = xmalloc(sizeof(slurm_addr)  */
-/* 				     * step_layout->node_cnt); */
 	step_layout->tasks = xmalloc(sizeof(uint16_t) 
 				     * step_layout->node_cnt);
 	step_layout->tids  = xmalloc(sizeof(uint32_t *) 
@@ -432,19 +430,11 @@ static int _init_task_layout(slurm_step_layout_t *step_layout,
 /* 			error("hostlist incomplete for this job request"); */
 /* 			hostlist_destroy(hl); */
 /* 			return SLURM_ERROR; */
-/* 		} */
-/* 		if(slurm_conf_get_addr(name, &step_layout->node_addr[i]) */
-/* 		   == SLURM_ERROR) { */
-/* 			error("_init_task_layout: can't get addr for " */
-/* 			      "host %s", name); */
-/* 			free(name); */
-/* 			continue; */
-/* 		} */
-							
+/* 		} */							
 /* 		debug2("host %d = %s", i, name); */
 /* 		free(name); */
 		cpus[i] = cpus_per_node[cpu_inx];
-		
+		//info("got %d cpus", cpus[i]);
 		if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
 			/* move to next record */
 			cpu_inx++;
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 5d6f9277bdb..151607a6a4b 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -525,7 +525,6 @@ _pick_step_nodes (struct job_record  *job_ptr,
 		step_spec->cpu_count = 0;
 	}
 
-	
 	if (step_spec->node_count) {
 		nodes_picked_cnt = bit_set_count(nodes_picked);
 		if (nodes_idle 
@@ -561,6 +560,12 @@ _pick_step_nodes (struct job_record  *job_ptr,
 	
 	if (step_spec->cpu_count) {
 		cpus_picked_cnt = count_cpus(nodes_picked);
+		/* The step is requesting more cpus than we got from the
+		   picked nodes, so we should return with an error. */
+		if(step_spec->cpu_count > cpus_picked_cnt) {
+			goto cleanup;
+		}
+		
 		if (nodes_idle
 		    &&  (step_spec->cpu_count > cpus_picked_cnt)) {
 			int first_bit, last_bit;
@@ -701,7 +706,7 @@ step_create(job_step_create_request_msg_t *step_specs,
 	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
 	node_count = bit_set_count(nodeset);
-	
+
 	if (step_specs->num_tasks == NO_VAL) {
 		if (step_specs->cpu_count != NO_VAL)
 			step_specs->num_tasks = step_specs->cpu_count;
@@ -734,7 +739,7 @@ step_create(job_step_create_request_msg_t *step_specs,
 		xfree(step_specs->node_list);
 		step_specs->node_list = xstrdup(step_node_list);
 	}
-
+	
 	step_ptr->step_node_bitmap = nodeset;
 	
 	switch(step_specs->task_dist) {
@@ -811,27 +816,27 @@ extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr,
 	int pos = -1;
 	struct job_record *job_ptr = step_ptr->job_ptr;
 	uint32_t node_cnt = job_ptr->cpu_count_reps[inx];
-	
+			
 	/* build the cpus-per-node arrays for the subset of nodes
 	   used by this job step */
 	for (i = 0; i < node_record_count; i++) {
 		if (bit_test(step_ptr->step_node_bitmap, i)) {
-			pos = bit_get_pos_num(step_ptr->step_node_bitmap, i);
+			/* find out the position in the job */
+			pos = bit_get_pos_num(job_ptr->node_bitmap, i);
 			if (pos == -1)
 				return NULL;
 			while(pos >= node_cnt) {
 				node_cnt += 
 					job_ptr->cpu_count_reps[++inx];
 			}
-			debug2("got inx of %d cpus = %d pos = %d", 
-			       inx, job_ptr->cpus_per_node[inx], pos);
+			debug2("%d got inx of %d cpus = %d pos = %d", 
+			       i, inx, job_ptr->cpus_per_node[inx], pos);
 			usable_cpus = job_ptr->cpus_per_node[inx];
 			
 			
 			//if(cpus_per_node[cpu_inx] != usable_cpus) {
 			if ((cpu_inx == -1) ||
-			    (cpus_per_node[cpu_inx] !=
-			     usable_cpus)) {
+			    (cpus_per_node[cpu_inx] != usable_cpus)) {
 				cpu_inx++;
 				
 				cpus_per_node[cpu_inx] = usable_cpus;
diff --git a/src/srun/launch.c b/src/srun/launch.c
index 94249c60e1c..cbb8d0799c9 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -220,7 +220,17 @@ launch(void *arg)
 		      rc, ret_data->err, ret_data->type);
 		nodeid = nodelist_find(job->step_layout->node_list,
 				       ret_data->node_name);
-		if (rc != SLURM_SUCCESS) {
+		
+		if(nodeid >= job->step_layout->node_cnt) {
+			/* Make sure we aren't trying to mark a
+			node that we didn't request but that was
+			included in the nodelist.  This should never
+			happen. */
+			error("got a problem with a non requested "
+			      "node %s(%d): %s",
+			      ret_data->node_name, nodeid, 
+			      slurm_strerror(rc));
+		} else if (rc != SLURM_SUCCESS) {
 			slurm_seterrno(rc);
 			error("Task launch failed on node %s(%d): %s",
 			      ret_data->node_name, nodeid, 
-- 
GitLab