From 6ab3bfd560e253afa8746c3957d6eb9218edb49c Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 11 Nov 2008 23:13:45 +0000
Subject: [PATCH] svn merge -r15647:15652
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.3

---
 NEWS                          |  3 ++
 doc/man/man1/srun.1           |  3 ++
 src/common/env.c              | 96 ++++++++++++++++++++---------------
 src/slurmctld/job_scheduler.c |  5 +-
 src/slurmctld/slurmctld.h     |  1 +
 src/slurmctld/step_mgr.c      | 28 ++++++----
 src/srun/opt.c                |  4 +-
 7 files changed, 85 insertions(+), 55 deletions(-)

diff --git a/NEWS b/NEWS
index c60dc187012..7d1cdfe8c76 100644
--- a/NEWS
+++ b/NEWS
@@ -103,6 +103,9 @@ documents those changes that are of interest to users and admins.
  -- Propagate --cpus-per-task parameter from salloc or sbatch input line to
     the SLURM_CPUS_PER_TASK environment variable in the spawned shell for 
     srun to use.
+ -- Add support for srun --cpus-per-task=0. This can be used to spawn tasks
+    without allocating resources to the job step from the job's allocation
+    when running multiple job steps with the --exclusive option.
  
 * Changes in SLURM 1.3.10
 =========================
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index d3d35a154ec..af7e88547f7 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -197,6 +197,9 @@ per node, a job request for 4 nodes and 3 CPUs per task may be
 allocated 3 or 6 CPUs per node (1 or 2 tasks per node) depending 
 upon resource consumption by other jobs. Such a job may be 
 unable to execute more than a total of 4 tasks. 
+This option may also be used to spawn tasks without allocating
+resources to the job step from the job's allocation when running
+multiple job steps with the \fB\-\-exclusive\fR option.
 
 .TP
 \fB\-\-comment\fR=<\fIstring\fR>
diff --git a/src/common/env.c b/src/common/env.c
index 74c51e93cf1..2674a8e4d9f 100644
--- a/src/common/env.c
+++ b/src/common/env.c
@@ -782,6 +782,7 @@ env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc,
 {
 	char *bgl_part_id = NULL, *tmp;
 	slurm_step_layout_t *step_layout = NULL;
+	uint32_t num_tasks = desc->num_tasks;
 
 	env_array_overwrite_fmt(dest, "SLURM_JOB_ID", "%u", alloc->job_id);
 	env_array_overwrite_fmt(dest, "SLURM_JOB_NUM_NODES", "%u",
@@ -815,23 +816,35 @@ env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc,
 	env_array_overwrite_fmt(dest, "SLURM_NNODES", "%u", alloc->node_cnt);
 	env_array_overwrite_fmt(dest, "SLURM_NODELIST", "%s", alloc->node_list);
 	
-	if(desc->num_tasks != NO_VAL) {
+	if(num_tasks == NO_VAL) {
-		/* If we know how many tasks we are going to do then
-		   we set SLURM_TASKS_PER_NODE */
-		step_layout = slurm_step_layout_create(alloc->node_list,
-						       alloc->cpus_per_node,
-						       alloc->cpu_count_reps,
-						       alloc->node_cnt,
-						       desc->num_tasks,
-						       desc->cpus_per_task,
-						       desc->task_dist,
-						       desc->plane_size);
-		tmp = _uint16_array_to_str(step_layout->node_cnt,
-					   step_layout->tasks);
-		slurm_step_layout_destroy(step_layout);
-		env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", 
-					"%s", tmp);
+		int i=0;
+		/* If no tasks were given we can figure it out here
+		 * by totalling up the cpus and then dividing by the
+		 * number of cpus per task */
+		
+		num_tasks = 0;
+		for (i = 0; i < alloc->num_cpu_groups; i++) {
+			num_tasks += alloc->cpu_count_reps[i] 
+				* alloc->cpus_per_node[i];
+		}
+		if((int)desc->cpus_per_task > 1 
+		   && desc->cpus_per_task != (uint16_t)NO_VAL)
+			num_tasks /= desc->cpus_per_task;
+		//num_tasks = desc->num_procs;
 	}
+	//info("got %d and %d", num_tasks,  desc->cpus_per_task);
+	step_layout = slurm_step_layout_create(alloc->node_list,
+					       alloc->cpus_per_node,
+					       alloc->cpu_count_reps,
+					       alloc->node_cnt,
+					       num_tasks,
+					       desc->cpus_per_task,
+					       desc->task_dist,
+					       desc->plane_size);
+	tmp = _uint16_array_to_str(step_layout->node_cnt,
+				   step_layout->tasks);
+	slurm_step_layout_destroy(step_layout);
+	env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", "%s", tmp);
 	xfree(tmp);
 }
 
@@ -862,12 +877,17 @@ extern void
 env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch,
 			const char *node_name)
 {
-	char *tmp;
+	char *tmp = getenvp(batch->environment, "SLURM_CPUS_PER_TASK");
 	uint32_t num_nodes = 0;
 	uint32_t num_cpus = 0;
 	int i;
 	slurm_step_layout_t *step_layout = NULL;
+	int cpus_per_task = 1;
+	uint32_t num_tasks = batch->nprocs;
 
+	if (tmp && (atoi(tmp) > 0))
+		cpus_per_task = atoi(tmp);
+	
 	/* There is no explicit node count in the batch structure,
 	 * so we need to calculate the node count. We also need to
 	 * figure out the explicit cpu count so we can figure out the
@@ -898,34 +918,26 @@ env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch,
 	env_array_overwrite_fmt(dest, "SLURM_JOBID", "%u", batch->job_id);
 	env_array_overwrite_fmt(dest, "SLURM_NNODES", "%u", num_nodes);
 	env_array_overwrite_fmt(dest, "SLURM_NODELIST", "%s", batch->nodes);
-
-	if(batch->nprocs) {
-		/* we can figure out the cpus_per_task here by
-		 * reversing what happens in sbatch */
-		int cpus_per_task = num_cpus / batch->nprocs;
-/* 		info(" we have %u / %u = %u", num_cpus, */
-/* 		     batch->nprocs, cpus_per_task); */
-		if(cpus_per_task < 1)
-			cpus_per_task = 1;
-
+	if(num_tasks) 
 		env_array_overwrite_fmt(dest, "SLURM_NPROCS", "%u", 
-					batch->nprocs);
-		step_layout = slurm_step_layout_create(batch->nodes,
-						       batch->cpus_per_node,
-						       batch->cpu_count_reps,
-						       num_nodes,
-						       batch->nprocs,
-						       (uint16_t)cpus_per_task,
-						       (uint16_t)
-						       SLURM_DIST_BLOCK,
-						       (uint16_t)NO_VAL);
-		tmp = _uint16_array_to_str(step_layout->node_cnt,
-					   step_layout->tasks);
-		slurm_step_layout_destroy(step_layout);
-		env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", 
-					"%s", tmp);
-		xfree(tmp);
-	}
+					num_tasks);
+	else 
+		num_tasks = num_cpus / cpus_per_task;
+	
+	step_layout = slurm_step_layout_create(batch->nodes,
+					       batch->cpus_per_node,
+					       batch->cpu_count_reps,
+					       num_nodes,
+					       num_tasks,
+					       (uint16_t)cpus_per_task,
+					       (uint16_t)
+					       SLURM_DIST_BLOCK,
+					       (uint16_t)NO_VAL);
+	tmp = _uint16_array_to_str(step_layout->node_cnt,
+				   step_layout->tasks);
+	slurm_step_layout_destroy(step_layout);
+	env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", "%s", tmp);
+	xfree(tmp);
 }
 
 /*
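The env.c change above derives the step task count from the allocation itself when the caller does not supply one: it totals the CPUs described by the (cpus_per_node, cpu_count_reps) run-length pairs and, when a meaningful cpus-per-task value is known, divides by it. Below is a minimal standalone sketch of that derivation; it is not SLURM source, and derive_num_tasks, NO_VAL16, and the sample allocation values are invented for illustration only.

/*
 * Standalone illustration of the task-count derivation performed in
 * env_array_for_job() above.  All names and values here are made up;
 * NO_VAL16 stands in for SLURM's (uint16_t)NO_VAL sentinel.
 */
#include <stdint.h>
#include <stdio.h>

#define NO_VAL16 ((uint16_t) 0xfffe)

static uint32_t derive_num_tasks(const uint16_t *cpus_per_node,
				 const uint32_t *cpu_count_reps,
				 int num_cpu_groups,
				 uint16_t cpus_per_task)
{
	uint32_t num_tasks = 0;
	int i;

	/* Total the CPUs in the allocation from the run-length pairs */
	for (i = 0; i < num_cpu_groups; i++)
		num_tasks += cpu_count_reps[i] * cpus_per_node[i];

	/* One task per CPU unless more than one CPU per task was requested */
	if ((cpus_per_task > 1) && (cpus_per_task != NO_VAL16))
		num_tasks /= cpus_per_task;

	return num_tasks;
}

int main(void)
{
	/* Pretend allocation: two nodes with 8 CPUs, one node with 4 CPUs */
	uint16_t cpus_per_node[]  = { 8, 4 };
	uint32_t cpu_count_reps[] = { 2, 1 };

	printf("%u tasks at 2 cpus per task\n",
	       (unsigned) derive_num_tasks(cpus_per_node, cpu_count_reps,
					   2, 2));
	printf("%u tasks with cpus per task unset\n",
	       (unsigned) derive_num_tasks(cpus_per_node, cpu_count_reps,
					   2, NO_VAL16));
	return 0;
}

With this pretend 20-CPU allocation the sketch prints 10 tasks at 2 CPUs per task and 20 tasks when cpus-per-task is unset, which is the count env_array_for_job() would feed into slurm_step_layout_create() when building SLURM_TASKS_PER_NODE.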
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 7e0e0ae8a9f..8e33269e841 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -85,7 +85,7 @@ static char **	_xduparray(uint16_t size, char ** array);
  * RET number of entries in job_queue
  * NOTE: the buffer at *job_queue must be xfreed by the caller
  */
-static int _build_user_job_list(uint32_t user_id,char* job_name,
+static int _build_user_job_list(uint32_t user_id, char* job_name,
 			        struct job_queue **job_queue)
 {
 	ListIterator job_iterator;
@@ -102,7 +102,8 @@ static int _build_user_job_list(uint32_t user_id,char* job_name,
 		xassert (job_ptr->magic == JOB_MAGIC);
 		if (job_ptr->user_id != user_id)
 			continue;
-		if (job_name && strcmp(job_name,job_ptr->name))
+		if (job_name && job_ptr->name &&
+		    strcmp(job_name, job_ptr->name))
 			continue;
 		if (job_buffer_size <= job_queue_size) {
 			job_buffer_size += 200;
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index b035063953c..7c63a03c0a9 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -478,6 +478,7 @@ struct 	step_record {
 					 * step relative to job's nodes, 
 					 * see src/common/select_job_res.h */
 	uint32_t cpu_count;		/* count of step's CPUs */
+	uint16_t cpus_per_task;		/* cpus per task initiated */
 	uint16_t cyclic_alloc;		/* set for cyclic task allocation 
 					   across nodes */
 	uint16_t exclusive;		/* dedicated resources for the step */
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 8d48d681167..6097480c58f 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -506,7 +506,8 @@ _pick_step_nodes (struct job_record  *job_ptr,
 				tot_tasks = MIN(tot_tasks, usable_mem);
 			}
 			if ((avail_tasks <= 0) ||
-			    (cpus_picked_cnt >= step_spec->cpu_count))
+			    ((cpus_picked_cnt > 0) &&
+			     (cpus_picked_cnt >= step_spec->cpu_count)))
 				bit_clear(nodes_avail, i);
 			else
 				cpus_picked_cnt += avail_tasks;
@@ -907,8 +908,10 @@ extern void step_alloc_lps(struct step_record *step_ptr)
 		step_node_inx++;
 		if (job_node_inx >= select_ptr->nhosts)
 			fatal("step_alloc_lps: node index bad");
-		select_ptr->cpus_used[job_node_inx] += 
-			step_ptr->step_layout->tasks[step_node_inx];
+		if (step_ptr->cpus_per_task) {
+			select_ptr->cpus_used[job_node_inx] += 
+				step_ptr->step_layout->tasks[step_node_inx];
+		}
 		if (step_ptr->mem_per_task) {
 			select_ptr->memory_used[job_node_inx] += 
 				(step_ptr->mem_per_task *
@@ -970,8 +973,10 @@ static void _step_dealloc_lps(struct step_record *step_ptr)
 		step_node_inx++;
 		if (job_node_inx >= select_ptr->nhosts)
 			fatal("_step_dealloc_lps: node index bad");
-		if (select_ptr->cpus_used[job_node_inx] >=
-		    step_ptr->step_layout->tasks[step_node_inx]) {
+		if (step_ptr->cpus_per_task == 0)
+			;	/* no CPUs allocated */
+		else if (select_ptr->cpus_used[job_node_inx] >=
+			 step_ptr->step_layout->tasks[step_node_inx]) {
 			select_ptr->cpus_used[job_node_inx] -= 
 				step_ptr->step_layout->tasks[step_node_inx];
 		} else {
@@ -1192,13 +1197,17 @@ step_create(job_step_create_request_msg_t *step_specs,
 	/* a batch script does not need switch info */
 	if (!batch_step) {
 		/* we can figure out the cpus_per_task here by
-		   reversing what happens in srun */
+		 * reversing what happens in srun, record
+		 * argument, plus save/restore in slurm v1.4 */
 		int cpus_per_task = step_specs->cpu_count / 
 			step_specs->num_tasks;
-/* 		info(" we have %u / %u = %u", step_specs->cpu_count, */
-/* 		     step_specs->num_tasks, cpus_per_task); */
-		if(cpus_per_task < 1)
+		if (cpus_per_task < 1)
 			cpus_per_task = 1;
+		if (step_specs->cpu_count)
+			step_ptr->cpus_per_task = cpus_per_task;
+		else
+			step_ptr->cpus_per_task = 0;
+
 		step_ptr->step_layout = 
 			step_layout_create(step_ptr,
 					   step_node_list,
@@ -2112,6 +2121,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	step_ptr->pre_sus_time = pre_sus_time;
 	step_ptr->tot_sus_time = tot_sus_time;
 	step_ptr->ckpt_time    = ckpt_time;
+	step_ptr->cpus_per_task = 1;	/* Need to save/restore in v1.4 */
 
 	slurm_step_layout_destroy(step_ptr->step_layout);
 	step_ptr->step_layout  = step_layout;
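In step_mgr.c the new step_record.cpus_per_task field drives the CPU accounting: step_alloc_lps() only charges a step's tasks against the job's cpus_used when cpus_per_task is non-zero, and _step_dealloc_lps() skips the matching credit, which is what lets an srun --cpus-per-task=0 step run beside --exclusive steps without consuming the allocation. The sketch below illustrates that rule only; it is not SLURM source, and fake_step, step_alloc, and step_dealloc are simplified stand-ins for step_record, step_alloc_lps(), and _step_dealloc_lps().

/*
 * Toy model of the cpus_per_task == 0 accounting rule added above.
 * Everything here is simplified for illustration.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_step {
	uint16_t cpus_per_task;	/* 0 means the step charges no CPUs */
	uint16_t tasks_on_node;	/* tasks this step runs on the node */
};

static void step_alloc(uint32_t *cpus_used, const struct fake_step *s)
{
	if (s->cpus_per_task)	/* mirrors the test in step_alloc_lps() */
		*cpus_used += s->tasks_on_node;
}

static void step_dealloc(uint32_t *cpus_used, const struct fake_step *s)
{
	if (s->cpus_per_task == 0)
		return;		/* no CPUs were allocated for this step */
	if (*cpus_used >= s->tasks_on_node)
		*cpus_used -= s->tasks_on_node;
	else
		*cpus_used = 0;	/* guard against underflow */
}

int main(void)
{
	uint32_t cpus_used = 0;
	struct fake_step exclusive = { 1, 4 };	/* normal step, 4 tasks */
	struct fake_step free_ride = { 0, 4 };	/* --cpus-per-task=0 step */

	step_alloc(&cpus_used, &exclusive);
	step_alloc(&cpus_used, &free_ride);
	printf("cpus_used with both steps running: %u\n",
	       (unsigned) cpus_used);		/* prints 4 */

	step_dealloc(&cpus_used, &free_ride);
	step_dealloc(&cpus_used, &exclusive);
	printf("cpus_used after both steps finish: %u\n",
	       (unsigned) cpus_used);		/* prints 0 */
	return 0;
}

Note that the patch leaves the memory accounting in step_alloc_lps() unchanged, so a step with cpus_per_task == 0 still has its mem_per_task request charged against the job.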
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 3d54cf214e6..f32bc21fc38 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -822,7 +822,7 @@ static void set_options(const int argc, char **argv)
 		case (int)'c':
 			opt.cpus_set = true;
 			opt.cpus_per_task = 
-				_get_int(optarg, "cpus-per-task", true);
+				_get_int(optarg, "cpus-per-task", false);
 			break;
 		case (int)'C':
 			xfree(opt.constraints);
@@ -1550,7 +1550,7 @@ static bool _opt_verify(void)
 		verified = false;
 	}
 
-	if (opt.cpus_per_task <= 0) {
+	if (opt.cpus_per_task < 0) {
 		error("%s: invalid number of cpus per task (-c %d)\n",
 		      opt.progname, opt.cpus_per_task);
 		verified = false;
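The opt.c change relaxes srun's client-side validation to match: --cpus-per-task may now be zero, and only negative values are rejected. A tiny sketch of the new rule follows; it is not SLURM source, and cpus_per_task_ok is an invented helper.

/* Zero is now a legal cpus-per-task value; only negatives are errors. */
#include <stdbool.h>
#include <stdio.h>

static bool cpus_per_task_ok(int cpus_per_task)
{
	return cpus_per_task >= 0;	/* previously: cpus_per_task > 0 */
}

int main(void)
{
	printf("-c 0  -> %s\n", cpus_per_task_ok(0) ? "accepted" : "rejected");
	printf("-c -1 -> %s\n", cpus_per_task_ok(-1) ? "accepted" : "rejected");
	return 0;
}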
-- 
GitLab