From c623b34d4b8cd5c3ca3239df1d8e31ef86b39df4 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Fri, 4 Aug 2006 15:05:45 +0000
Subject: [PATCH] svn merge -r8747:8772
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                      |   2 +
 src/slurmctld/agent.c     |   1 +
 src/srun/allocate.c       |  25 ++++++--
 src/srun/srun.c           |   6 ++
 src/srun/srun_job.c       | 129 +++++++++++++++++++++++++++++++-------
 testsuite/expect/globals  |   2 +-
 testsuite/expect/test1.87 |  41 +++---------
 7 files changed, 146 insertions(+), 60 deletions(-)

diff --git a/NEWS b/NEWS
index 58349349abf..2f61ce9fb97 100644
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,8 @@ documents those changes that are of interest to users and admins.
 -- Fixed heterogeneous cpu overcommit issue
  -- Fix bug where srun would hang if it ran on one node and that 
     node's slurmd died
+ -- Fix bug where srun generated an incorrect task layout when a min-max
+    node range is specified (e.g. "srun -N1-4 ...")
 
 * Changes in SLURM 1.1.5
 ========================
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 391277b3c5d..471bacd93a0 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -791,6 +791,7 @@ static void *_thread_per_group_rpc(void *args)
 		if (job_ptr)
 			state = job_ptr->job_state;	
 		unlock_slurmctld(job_read_lock);
+
 		if ((state == JOB_RUNNING)
 		    || (msg_type == SRUN_JOB_COMPLETE)
 		    || ((msg_type == SRUN_NODE_FAIL)
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 5034c5e23bf..8bad0ba489d 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -551,10 +551,27 @@ static job_step_create_request_msg_t *
 _step_req_create(srun_job_t *j)
 {
 	job_step_create_request_msg_t *r = xmalloc(sizeof(*r));
+	hostlist_t hl;
 	r->job_id     = j->jobid;
 	r->user_id    = opt.uid;
-	r->node_count = j->nhosts;
-	r->cpu_count  = opt.overcommit ? j->nhosts
+
+	/* get the correct number of hosts to run tasks on */
+	if(opt.nodelist) {
+		hl = hostlist_create(opt.nodelist);
+		hostlist_uniq(hl);
+		r->node_count = hostlist_count(hl);
+		hostlist_destroy(hl);
+	} else if((opt.max_nodes > 0) && (opt.max_nodes < j->nhosts))
+		r->node_count = opt.max_nodes;
+	else
+		r->node_count = j->nhosts;
+	/* info("send %d or %d? sending %d", opt.max_nodes, */
+/* 		     j->nhosts, r->node_count); */
+	if(r->node_count > j->nhosts) {
+		error("Asking for more nodes than allocated");
+		return NULL;
+	}
+	r->cpu_count  = opt.overcommit ? r->node_count
 		                       : (opt.nprocs*opt.cpus_per_task);
 	r->num_tasks  = opt.nprocs;
 	r->node_list  = xstrdup(opt.nodelist);
@@ -578,8 +595,8 @@ _step_req_create(srun_job_t *j)
 		r->task_dist = SLURM_DIST_CYCLIC;
 		break;
 	default:
-		r->task_dist = ((opt.nprocs <= j->nhosts)
-				? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK);
+		r->task_dist = (opt.nprocs <= r->node_count) 
+			? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
 		break;
 
 	}
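
The allocate.c hunk above derives the step's node count from an explicit --nodelist when one is given, otherwise caps it at opt.max_nodes, and only then falls back to the full allocation; the cyclic-versus-block distribution choice is then made against that derived count rather than j->nhosts. A minimal standalone sketch of that selection logic, where pick_step_node_count and the sample values in main() are illustrative and not part of the patch:

#include <stdio.h>

/* Stand-in for the node_count logic in _step_req_create(); the real code
 * counts an explicit --nodelist with hostlist_create()/hostlist_uniq(). */
static int pick_step_node_count(int explicit_count, int max_nodes,
				int alloc_nodes)
{
	if (explicit_count > 0)			/* -w/--nodelist given */
		return explicit_count;
	if ((max_nodes > 0) && (max_nodes < alloc_nodes))
		return max_nodes;		/* upper bound of -N min-max */
	return alloc_nodes;			/* default: whole allocation */
}

int main(void)
{
	/* e.g. "srun -N1-4 -n2" on a 4-node allocation */
	int node_count = pick_step_node_count(0, 4, 4);
	int nprocs = 2;

	printf("%d nodes, %s distribution\n", node_count,
	       (nprocs <= node_count) ? "cyclic" : "block");
	return 0;
}
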
diff --git a/src/srun/srun.c b/src/srun/srun.c
index e89e3dd302a..547149063c0 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -354,6 +354,10 @@ int srun(int ac, char **av)
 		info("Force Terminated job");
 		srun_job_destroy(job, 0);
 		exit(1);
+	} else if (job->state == SRUN_JOB_CANCELLED) {
+		info("Cancelling job");
+		srun_job_destroy(job, NO_VAL);
+		exit(1);
 	} 
 
 	/*
@@ -365,6 +369,8 @@ int srun(int ac, char **av)
 		error ("Waiting on message thread: %m");
 	debug("done");
 	
+	/* Check whether the job was cancelled here as well, in case the
+	   state changed while we were waiting for the message thread. */
 	if (job->state == SRUN_JOB_CANCELLED) {
 		info("Cancelling job");
 		srun_job_destroy(job, NO_VAL);
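
The srun.c change above tests for SRUN_JOB_CANCELLED both before and after waiting for the message thread, because a cancellation can arrive while srun is blocked in that wait. A small pthread sketch of the same check-again-after-the-thread-finishes pattern; cancelled, state_lock and msg_thread are made-up stand-ins for srun's job state and message thread:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int cancelled = 0;	/* stand-in for job->state == SRUN_JOB_CANCELLED */

static void *msg_thread(void *arg)
{
	(void) arg;
	/* Imagine a cancel request arriving while the main thread waits. */
	pthread_mutex_lock(&state_lock);
	cancelled = 1;
	pthread_mutex_unlock(&state_lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int seen_before;

	pthread_create(&tid, NULL, msg_thread, NULL);

	pthread_mutex_lock(&state_lock);
	seen_before = cancelled;	/* first check, before the wait */
	pthread_mutex_unlock(&state_lock);

	pthread_join(tid, NULL);	/* wait for the message thread */

	pthread_mutex_lock(&state_lock);
	if (cancelled && !seen_before)	/* state changed during the wait */
		printf("job was cancelled while we were waiting\n");
	pthread_mutex_unlock(&state_lock);
	return 0;
}
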
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index 10e095ebbc5..78335f3234f 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -144,9 +144,10 @@ job_step_create_allocation(uint32_t job_id)
 {
 	srun_job_t *job = NULL;
 	allocation_info_t *ai = xmalloc(sizeof(*ai));
-	uint32_t cpn = 1;
 	hostlist_t hl = NULL;
 	char buf[8192];
+	int count = 0;
+	char *tasks_per_node = xstrdup(getenv("SLURM_TASKS_PER_NODE"));
 	
 	ai->jobid          = job_id;
 	ai->stepid         = NO_VAL;
@@ -168,13 +169,21 @@ job_step_create_allocation(uint32_t job_id)
 			}
 		}
 	}
-	
 	ai->nodelist       = opt.alloc_nodelist;
+	/* hl = hostlist_create(ai->nodelist); */
+/* 	hostlist_uniq(hl); */
+/* 	ai->nnodes = hostlist_count(hl); */
+/* 	hostlist_destroy(hl); */
+/* 	info("using %s %d not %d", ai->nodelist, ai->nnodes, opt.min_nodes); */
+
 	if (opt.exc_nodes) {
-		hl = hostlist_create(ai->nodelist);
 		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
 		char *node_name = NULL;
-
+		if(opt.nodelist)
+			hl = hostlist_create(opt.nodelist);
+		else
+			hl = hostlist_create(ai->nodelist);
+		info("using %s or %s", opt.nodelist, ai->nodelist);
 		while ((node_name = hostlist_shift(exc_hl))) {
 			int inx = hostlist_find(hl, node_name);
 			if (inx >= 0) {
@@ -183,9 +192,15 @@ job_step_create_allocation(uint32_t job_id)
 			}
 			free(node_name);
 		}
+		if(!hostlist_count(hl)) {
+			error("No nodes left after removing excluded nodes.  Can't run job.");
+			return NULL;
+		}
 		hostlist_destroy(exc_hl);
 		hostlist_ranged_string(hl, sizeof(buf), buf);
 		hostlist_destroy(hl);
+		xfree(opt.nodelist);
+		opt.nodelist = xstrdup(buf);
 		xfree(ai->nodelist);
 		ai->nodelist = xstrdup(buf);
 	}
@@ -194,24 +209,86 @@ job_step_create_allocation(uint32_t job_id)
 /* 		opt.nodelist = ai->nodelist; */
 	if(opt.nodelist) { 
 		hl = hostlist_create(opt.nodelist);
+		if(!hostlist_count(hl)) {
+			error("Requested node list is empty.  Can't run job.");
+			return NULL;
+		}
 		hostlist_ranged_string(hl, sizeof(buf), buf);
+		count = hostlist_count(hl);
 		hostlist_destroy(hl);
 		xfree(ai->nodelist);
 		ai->nodelist = xstrdup(buf);
 		xfree(opt.nodelist);
 		opt.nodelist = xstrdup(buf);
 	}
-	ai->nnodes         = opt.min_nodes;
-	debug("node list is now %s", ai->nodelist);
-	cpn = (opt.nprocs + ai->nnodes - 1) / ai->nnodes;
-	ai->cpus_per_node  = &cpn;
-	ai->cpu_count_reps = &ai->nnodes;
+	if(opt.distribution == SLURM_DIST_ARBITRARY) {
+		if(count != opt.nprocs) {
+			error("You asked for %d tasks but specified %d nodes",
+			      opt.nprocs, count);
+			goto error;
+		}
+	}
+
+	hl = hostlist_create(ai->nodelist);
+	hostlist_uniq(hl);
+	ai->nnodes = hostlist_count(hl);
+	hostlist_destroy(hl);
 	
+	//ai->nnodes         = opt.min_nodes;
+	/* info("node list is now %s %s %d procs",  */
+/* 	     ai->nodelist, opt.nodelist, */
+/* 	     opt.nprocs); */
+	if(tasks_per_node) {
+		int i = 0;
+		
+		ai->num_cpu_groups = 0;
+		ai->cpus_per_node = xmalloc(sizeof(uint32_t) * ai->nnodes);
+		ai->cpu_count_reps = xmalloc(sizeof(uint32_t) * ai->nnodes);
+		
+		while(tasks_per_node[i]) {
+			if(tasks_per_node[i] >= '0' 
+			   && tasks_per_node[i] <= '9')
+				ai->cpus_per_node[ai->num_cpu_groups] =
+					atoi(&tasks_per_node[i]);
+			else {
+				error("bad task count in SLURM_TASKS_PER_NODE %s",
+				      tasks_per_node);
+				goto error;
+			}
+			while(tasks_per_node[i]!='x' && tasks_per_node[i])
+				i++;
+			if(tasks_per_node[i] == 'x') i++;
+			if(tasks_per_node[i] >= '0' 
+			   && tasks_per_node[i] <= '9')
+				ai->cpu_count_reps[ai->num_cpu_groups] = 
+					atoi(&tasks_per_node[i]);
+			else {
+				error("bad repeat count in SLURM_TASKS_PER_NODE %s",
+				      tasks_per_node);
+				goto error;
+			}
+			while(tasks_per_node[i]!=',' && tasks_per_node[i])
+				i++;
+			if(tasks_per_node[i] == ',') {
+				i++;	
+			}
+			ai->num_cpu_groups++;
+		}
+		xfree(tasks_per_node);
+	} else {
+		uint32_t cpn = (opt.nprocs + ai->nnodes - 1) / ai->nnodes;
+		info("SLURM_TASKS_PER_NODE not set! "
+		     "Guessing %d cpus per node", cpn);
+		ai->cpus_per_node  = &cpn;
+		ai->cpu_count_reps = &ai->nnodes;
+	}
+	if(!opt.max_nodes)
+		opt.max_nodes = opt.min_nodes;
 	/* 
 	 * Create job, then fill in host addresses
 	 */
 	job = _job_create_structure(ai);
-	
+error:
    	xfree(ai);
 	return (job);
 
@@ -247,30 +324,34 @@ job_create_allocation(resource_allocation_response_msg_t *resp)
  * Create an srun job structure from a resource allocation response msg
  */
 static srun_job_t *
-_job_create_structure(allocation_info_t *info)
+_job_create_structure(allocation_info_t *ainfo)
 {
 	srun_job_t *job = xmalloc(sizeof(srun_job_t));
 	
-	_set_nprocs(info);
+	_set_nprocs(ainfo);
 	debug2("creating job with %d tasks", opt.nprocs);
 
 	slurm_mutex_init(&job->state_mutex);
 	pthread_cond_init(&job->state_cond, NULL);
 	job->state = SRUN_JOB_INIT;
 
- 	job->nodelist = xstrdup(info->nodelist); 
-	job->stepid  = info->stepid;
+ 	job->nodelist = xstrdup(ainfo->nodelist); 
+	job->stepid  = ainfo->stepid;
 	
 #ifdef HAVE_FRONT_END	/* Limited job step support */
 	opt.overcommit = true;
 	job->nhosts = 1;
 #else
-	job->nhosts   = info->nnodes;
+	job->nhosts   = ainfo->nnodes;
 #endif
-
-
-	job->select_jobinfo = info->select_jobinfo;
-	job->jobid   = info->jobid;
+	if(opt.min_nodes > job->nhosts) {
+		error("Only allocated %d nodes, asked for %d",
+		      job->nhosts, opt.min_nodes);
+		return NULL;
+	}	
+
+	job->select_jobinfo = ainfo->select_jobinfo;
+	job->jobid   = ainfo->jobid;
 	
 	job->ntasks  = opt.nprocs;
 	job->task_prolog = xstrdup(opt.task_prolog);
@@ -488,17 +569,17 @@ _estimate_nports(int nclients, int cli_per_port)
 }
 
 static int
-_compute_task_count(allocation_info_t *info)
+_compute_task_count(allocation_info_t *ainfo)
 {
 	int i, cnt = 0;
 
 	if (opt.cpus_set) {
-		for (i = 0; i < info->num_cpu_groups; i++)
-			cnt += ( info->cpu_count_reps[i] *
-				 (info->cpus_per_node[i]/opt.cpus_per_task));
+		for (i = 0; i < ainfo->num_cpu_groups; i++)
+			cnt += ( ainfo->cpu_count_reps[i] *
+				 (ainfo->cpus_per_node[i]/opt.cpus_per_task));
 	}
 
-	return (cnt < info->nnodes) ? info->nnodes : cnt;
+	return (cnt < ainfo->nnodes) ? ainfo->nnodes : cnt;
 }
 
 static void
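
The first srun_job.c hunk above builds the cpus_per_node/cpu_count_reps arrays by parsing SLURM_TASKS_PER_NODE, whose value takes the form "2(x3),1" (two tasks on each of three nodes, then one task on one node). A self-contained sketch of such a parse is shown below; parse_tasks_per_node, MAX_GROUPS and the choice to default a bare entry to a repeat count of one are assumptions of the sketch, not code from the patch:

#include <stdio.h>
#include <stdlib.h>

#define MAX_GROUPS 64	/* illustrative bound, not taken from the patch */

/* Parse a string such as "2(x3),1" into (count, reps) pairs.
 * Returns the number of groups parsed, or -1 on a malformed string. */
static int parse_tasks_per_node(const char *s, unsigned counts[],
				unsigned reps[])
{
	int n = 0;

	while (*s && (n < MAX_GROUPS)) {
		char *end;

		counts[n] = (unsigned) strtoul(s, &end, 10);
		if (end == s)
			return -1;		/* expected a task count */
		s = end;
		if ((s[0] == '(') && (s[1] == 'x')) {
			reps[n] = (unsigned) strtoul(s + 2, &end, 10);
			if ((end == s + 2) || (*end != ')'))
				return -1;	/* malformed "(xN)" */
			s = end + 1;
		} else
			reps[n] = 1;		/* bare entry: one node */
		n++;
		if (*s == ',')
			s++;
		else if (*s)
			return -1;		/* unexpected character */
	}
	return n;
}

int main(void)
{
	unsigned counts[MAX_GROUPS], reps[MAX_GROUPS];
	int i, n = parse_tasks_per_node("2(x3),1", counts, reps);

	for (i = 0; i < n; i++)
		printf("%u task(s) on each of %u node(s)\n",
		       counts[i], reps[i]);
	return 0;
}
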
diff --git a/testsuite/expect/globals b/testsuite/expect/globals
index f773d74f2a7..6bbe8869b8c 100755
--- a/testsuite/expect/globals
+++ b/testsuite/expect/globals
@@ -718,7 +718,7 @@ proc default_partition {} {
     set name ""
     set fd [open "|$scontrol --all --oneliner show partition"]
     while {[gets $fd line] != -1} {
-	if {[regexp {^PartitionName=(\w+).*Default=YES} $line name]
+	if {[regexp {^PartitionName=(\w+).*Default=YES} $line frag name]
 	    == 1} {
 	    break
 	}
diff --git a/testsuite/expect/test1.87 b/testsuite/expect/test1.87
index cb98b2872ce..e5c7d914cde 100755
--- a/testsuite/expect/test1.87
+++ b/testsuite/expect/test1.87
@@ -267,37 +267,19 @@ if {$matches < 2} {
 
 #
 # Get two nodes relative (starting at) node 3
-# Since there is only one node left, we check for a warning message
+# Since there is only one node left, we check for an error message
 #
-set matches 0
-set warning 0
+set error 0
 send "$srun -l -N2 -n2 -O --relative=3 $bin_hostname\n"
 expect {
-	-re "Warning:" {
-		set warning 1
+	-re "error:" {
+		send_user "This error is expected, no worries\n"
+		incr error
 		exp_continue
 	}
 	-re "($number): ($alpha_numeric)" {
-		if { $host_num == 0 } {
-			if {[string compare $expect_out(2,string) $host_3] == 0} {
-				incr matches
-			} else {
-				send_user "\nFAILURE: wrong node responded\n"
-				set exit_code   1	
-			}
-		}
-		if { $host_num == 1 } {
-			if {[string compare $expect_out(2,string) $host_3] == 0} {
-				incr matches
-			} else {
-				send_user "\nFAILURE: wrong node responded\n"
-				set exit_code   1	
-			}
-		}
-		if { $host_num > 1 } {
-			send_user "\nFAILURE: too many tasks responded\n"
-			set exit_code   1	
-		}
+		send_user "\nFAILURE: task ran where it should not have been able to run\n"
+		set exit_code   1	
 		exp_continue
 	}
 	-re "Unable to contact" {
@@ -316,12 +298,9 @@ expect {
 		wait
 	}
 }
-if {$matches < 2} {
-	send_user "\nFAILURE: required nodes failed to respond\n"
-	set exit_code   1
-}
-if {$warning < 1} {
-	send_user "\nFAILURE: warning message missing for inconsistent node count\n"
+
+if {$error == 0} {
+	send_user "\nFAILURE: No error for inconsistent node count\n"
 	set exit_code   1
 }
 
-- 
GitLab