From 85c0df8f0afb1fd86aa89dddeb268fe8a5f6712c Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 2 Dec 2002 21:04:21 +0000
Subject: [PATCH] Record task count in job_step_info, pass task_count in job
 and job step initiation. Establish limit of MAX_TASKS_PER_NODE and add
 matching error code.

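When a job step is created without an explicit task count (num_tasks ==
NO_VAL), step_create() now derives one: the requested cpu_count when
available, otherwise one task per allocated node. The result must fall
between 1 and node_count * MAX_TASKS_PER_NODE, or the new
ESLURM_BAD_TASK_COUNT error is returned. Condensed from the step_mgr.c
hunk below:

	if (step_specs->num_tasks == NO_VAL) {
		if (step_specs->cpu_count != NO_VAL)
			step_specs->num_tasks = step_specs->cpu_count;
		else
			step_specs->num_tasks = node_count;
	}
	if ((step_specs->num_tasks < 1) ||
	    (step_specs->num_tasks > (node_count * MAX_TASKS_PER_NODE)))
		return ESLURM_BAD_TASK_COUNT;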
---
 doc/man/man1/srun.1              |  5 +++--
 src/common/slurm_errno.c         |  2 ++
 src/common/slurm_errno.h         |  1 +
 src/common/slurm_protocol_pack.c | 16 ++++++++++++--
 src/common/slurm_protocol_pack.h |  4 ++--
 src/slurmctld/controller.c       |  6 +++--
 src/slurmctld/job_mgr.c          |  4 ++++
 src/slurmctld/node_scheduler.c   |  3 ++-
 src/slurmctld/slurmctld.h        |  4 +++-
 src/slurmctld/step_mgr.c         | 38 +++++++++++++++++++++++---------
 src/srun/job.c                   |  7 +++++-
 src/srun/srun.c                  | 15 ++++++++++---
 12 files changed, 80 insertions(+), 25 deletions(-)
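
On the srun side, --overcommit now decouples the cpu request from the
task count: the job asks for one cpu per node, while the real task
count travels in the new num_tasks field of the job and job step
requests. Condensed from the srun.c hunks below:

	if (opt.overcommit)
		job.num_procs = opt.nodes;	/* one cpu per node */
	else
		job.num_procs = opt.nprocs * opt.cpus_per_task;
	job.num_tasks = opt.nprocs;		/* actual task count */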

diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 4ac42b02a87..987ff097024 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -74,9 +74,10 @@ initiation and higher system utilization, but lower application performance.
 \fB\-O\fR, \fB\-\-overcommit\fR
 overcommit resources. Normally,
 .B srun
-will not allocate more than one process to a cpu. By specifying
+will not allocate more than one process per cpu. By specifying
 \fB\-\-overcommit\fR you are explicitly allowing more than one process
-per cpu.
+per cpu. However, no more than \fBMAX_TASKS_PER_NODE\fR tasks are
+permitted to execute per node.
 .TP
 \fB\-T\fR, \fB\-\-threads\fR=\fInthreads\fR
 Request that 
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index e77bed6f12c..8e59cf34e6a 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -128,6 +128,8 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "Task distribution specification invalid"		},
 	{ ESLURM_JOB_PENDING, 
 	  "Job is pending execution"				},
+	{ ESLURM_BAD_TASK_COUNT, 
+	  "Task count specification invalid"			},
 
 	/* Quadrics Elan routine error codes */
 
diff --git a/src/common/slurm_errno.h b/src/common/slurm_errno.h
index c922a81905c..50a05bdd786 100644
--- a/src/common/slurm_errno.h
+++ b/src/common/slurm_errno.h
@@ -85,6 +85,7 @@ enum {
 	ESLURM_INTERCONNECT_FAILURE,
 	ESLURM_BAD_DIST,
 	ESLURM_JOB_PENDING,
+	ESLURM_BAD_TASK_COUNT,
 
 	/* Quadrics Elan routine error codes */
 	ENOSLURM =					3000,
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 33fe3be375a..01dfde4eeef 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1076,6 +1076,8 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t
 	pack32(msg->user_id, buffer);
 	pack32(msg->node_count, buffer);
 	pack32(msg->cpu_count, buffer);
+	pack32(msg->num_tasks, buffer);
+
 	pack16(msg->relative, buffer);
 	pack16(msg->task_dist, buffer);
 	packstr(msg->node_list, buffer);
@@ -1097,6 +1099,8 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg,
 	safe_unpack32(&(tmp_ptr->user_id), buffer);
 	safe_unpack32(&(tmp_ptr->node_count), buffer);
 	safe_unpack32(&(tmp_ptr->cpu_count), buffer);
+	safe_unpack32(&(tmp_ptr->num_tasks), buffer);
+
 	safe_unpack16(&(tmp_ptr->relative), buffer);
 	safe_unpack16(&(tmp_ptr->task_dist), buffer);
 	safe_unpackstr_xmalloc(&(tmp_ptr->node_list), &uint16_tmp, buffer);
@@ -1337,12 +1341,15 @@ _unpack_partition_info_members(partition_info_t * part, Buf buffer)
  */
 void
 pack_job_step_info_members(uint32_t job_id, uint16_t step_id,
-			   uint32_t user_id, time_t start_time,
-			   char *partition, char *nodes, Buf buffer)
+			   uint32_t user_id, uint32_t num_tasks,
+			   time_t start_time, char *partition, 
+			   char *nodes, Buf buffer)
 {
 	pack32(job_id, buffer);
 	pack16(step_id, buffer);
 	pack32(user_id, buffer);
+	pack32(num_tasks, buffer);
+
 	pack_time(start_time, buffer);
 	packstr(partition, buffer);
 	packstr(nodes, buffer);
@@ -1361,6 +1368,7 @@ pack_job_step_info(job_step_info_t * step, Buf buffer)
 	pack_job_step_info_members(step->job_id,
 				   step->step_id,
 				   step->user_id,
+				   step->num_tasks,
 				   step->start_time,
 				   step->partition, step->nodes, buffer);
 }
@@ -1379,6 +1387,8 @@ _unpack_job_step_info_members(job_step_info_t * step, Buf buffer)
 	safe_unpack32(&step->job_id, buffer);
 	safe_unpack16(&step->step_id, buffer);
 	safe_unpack32(&step->user_id, buffer);
+	safe_unpack32(&step->num_tasks, buffer);
+
 	safe_unpack_time(&step->start_time, buffer);
 	safe_unpackstr_xmalloc(&step->partition, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&step->nodes, &uint16_tmp, buffer);
@@ -1674,6 +1684,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer)
 
 	pack32(job_desc_ptr->num_procs, buffer);
 	pack32(job_desc_ptr->num_nodes, buffer);
+	pack32(job_desc_ptr->num_tasks, buffer);
 	pack32(job_desc_ptr->user_id, buffer);
 
 }
@@ -1723,6 +1734,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer)
 
 	safe_unpack32(&job_desc_ptr->num_procs, buffer);
 	safe_unpack32(&job_desc_ptr->num_nodes, buffer);
+	safe_unpack32(&job_desc_ptr->num_tasks, buffer);
 	safe_unpack32(&job_desc_ptr->user_id, buffer);
 
 	return SLURM_SUCCESS;
diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h
index 53d23107a11..1a76d1bc4d3 100644
--- a/src/common/slurm_protocol_pack.h
+++ b/src/common/slurm_protocol_pack.h
@@ -151,7 +151,7 @@ void pack_job_step_info ( job_step_info_t* step, Buf buffer );
  *			automatically updated
  */ 
 void pack_job_step_info_members( uint32_t job_id, uint16_t step_id, 
-		uint32_t user_id, time_t start_time, char *partition, 
-		char *nodes, Buf buffer );
+		uint32_t user_id, uint32_t num_tasks, time_t start_time, 
+		char *partition, char *nodes, Buf buffer );
 
 #endif
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index c6dd05bc100..6526f471d4d 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1497,9 +1497,11 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		return;
 	}
 
-	req_step_msg.job_id = job_id;
-	req_step_msg.user_id = job_desc_msg->user_id;
+	req_step_msg.job_id     = job_id;
+	req_step_msg.user_id    = job_desc_msg->user_id;
 	req_step_msg.node_count = INFINITE;
+	req_step_msg.cpu_count  = job_desc_msg->num_procs;
+	req_step_msg.num_tasks  = job_desc_msg->num_tasks;
 	error_code = step_create(&req_step_msg, &step_rec);
 	/* note: no need to free step_rec, pointer to global job step record */
 	if (error_code) {
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 122bcedf909..808fddecdd4 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -438,6 +438,7 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer)
 
 	pack16((uint16_t) step_ptr->step_id, buffer);
 	pack16((uint16_t) step_ptr->cyclic_alloc, buffer);
+	pack32(step_ptr->num_tasks, buffer);
 	pack_time(step_ptr->start_time, buffer);
 	node_list = bitmap2node_name(step_ptr->node_bitmap);
 	packstr(node_list, buffer);
@@ -687,11 +688,13 @@ int load_job_state(void)
 		while (step_flag == STEP_FLAG) {
 			struct step_record *step_ptr;
 			uint16_t step_id, cyclic_alloc;
+			uint32_t num_tasks;
 			time_t start_time;
 			char *node_list;
 
 			safe_unpack16(&step_id, buffer);
 			safe_unpack16(&cyclic_alloc, buffer);
+			safe_unpack32(&num_tasks, buffer);
 			safe_unpack_time(&start_time, buffer);
 			safe_unpackstr_xmalloc(&node_list, &name_len,
 					       buffer);
@@ -712,6 +715,7 @@ int load_job_state(void)
 				break;
 			step_ptr->step_id = step_id;
 			step_ptr->cyclic_alloc = cyclic_alloc;
+			step_ptr->num_tasks = num_tasks;
 			step_ptr->start_time = start_time;
 			info("recovered job step %u.%u", job_id, step_id);
 			if (node_list) {
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 2406254f5a4..d2d1ec87c13 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -65,7 +65,8 @@ static int _pick_best_nodes(struct node_set *node_set_ptr,
 static int _valid_features(char *requested, char *available);
 
 
-/* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
+/*
+ * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
  * IN bitmap - map of nodes to be allocated
  * globals: node_record_count - number of nodes in the system
  *	node_record_table_ptr - pointer to global node table
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index c9354d1e135..fcaf15b2bfc 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -251,6 +251,7 @@ struct 	step_record {
 	uint16_t step_id;		/* step number */
 	uint16_t cyclic_alloc;		/* set for cyclic task allocation 
 					   across nodes */
+	uint32_t num_tasks;		/* number of tasks required */
 	time_t start_time;      	/* step allocation time */
 	bitstr_t *node_bitmap;		/* bitmap of nodes allocated to job 
 					   step */
@@ -266,7 +267,8 @@ extern List job_list;			/* list of job_record entries */
  *  Global slurmctld functions
 \*****************************************************************************/
 
-/* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
+/*
+ * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
  * IN bitmap - map of nodes to be allocated
  * globals: node_record_count - number of nodes in the system
  *	node_record_table_ptr - pointer to global node table
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index bbf6d8d22ba..09d4f35f9c4 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -156,9 +156,9 @@ dump_step_desc(step_specs *step_spec)
 	debug3("StepDesc: user_id=%u job_id=%u node_count=%u, cpu_count=%u", 
 		step_spec->user_id, step_spec->job_id, 
 		step_spec->node_count, step_spec->cpu_count);
-	debug3("   relative=%u task_dist=%u node_list=%s", 
-		step_spec->relative, step_spec->task_dist, 
-		step_spec->node_list);
+	debug3("   num_tasks=%u relative=%u task_dist=%u node_list=%s", 
+		step_spec->num_tasks, step_spec->relative, 
+		step_spec->task_dist, step_spec->node_list);
 }
 
 
@@ -410,7 +410,7 @@ cleanup:
 
 /*
  * step_create - creates a step_record in step_specs->job_id, sets up the
- *	accoding to the step_specs.
+ *	step according to the step_specs.
  * IN step_specs - job step specifications
  * OUT new_step_record - pointer to the new step_record (NULL on error)
  * RET - 0 or error code
@@ -423,9 +423,9 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 	struct step_record *step_ptr;
 	struct job_record  *job_ptr;
 	bitstr_t *nodeset;
+	int node_count;
 #ifdef HAVE_LIBELAN3
 	int first, last, i, node_id;
-	int nprocs = step_specs->cpu_count;
 	int node_set_size = QSW_MAX_TASKS; /* overkill but safe */
 #endif
 
@@ -434,15 +434,14 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 	if (job_ptr == NULL)
 		return ESLURM_INVALID_JOB_ID ;
 
-	if (step_specs->user_id != job_ptr->user_id &&
-	    	step_specs->user_id != 0)
+	if ((step_specs->user_id != job_ptr->user_id) &&
+	    (step_specs->user_id != 0))
 		return ESLURM_ACCESS_DENIED ;
 
 	if ((job_ptr->job_state == JOB_COMPLETE) || 
 	    (job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_TIMEOUT))
 		return ESLURM_ALREADY_DONE;
-	job_ptr->time_last_active = time(NULL);
 
 #ifdef HAVE_LIBELAN3
 	if ((step_specs->task_dist != SLURM_DIST_CYCLIC) &&
@@ -450,10 +449,25 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 		return ESLURM_BAD_DIST;
 #endif
 
-	nodeset = _pick_step_nodes (job_ptr, step_specs );
-
+	job_ptr->time_last_active = time(NULL);
+	nodeset = _pick_step_nodes (job_ptr, step_specs);
 	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
+	node_count = bit_set_count(nodeset);
+
+	if (step_specs->num_tasks == NO_VAL) {
+		if (step_specs->cpu_count != NO_VAL)
+			step_specs->num_tasks = step_specs->cpu_count;
+		else
+			step_specs->num_tasks = node_count;
+	}
+	if ((step_specs->num_tasks < 1) ||
+	    (step_specs->num_tasks > (node_count*MAX_TASKS_PER_NODE)))
+		return ESLURM_BAD_TASK_COUNT;
+#ifdef HAVE_LIBELAN3
+	if (step_specs->num_tasks > node_set_size)
+		return ESLURM_BAD_TASK_COUNT;
+#endif
 
 	step_ptr = create_step_record (job_ptr);
 	if (step_ptr == NULL)
@@ -463,6 +477,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 	step_ptr->node_bitmap = nodeset;
 	step_ptr->cyclic_alloc = 
 		(uint16_t) (step_specs->task_dist == SLURM_DIST_CYCLIC);
+	step_ptr->num_tasks = step_specs->num_tasks;
 
 #ifdef HAVE_LIBELAN3
 	if (qsw_alloc_jobinfo (&step_ptr->qsw_job) < 0)
@@ -488,7 +503,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 			}
 		}
 	}
-	if (qsw_setup_jobinfo (step_ptr->qsw_job, nprocs, 
+	if (qsw_setup_jobinfo (step_ptr->qsw_job, step_specs->num_tasks, 
 				nodeset, step_ptr->cyclic_alloc) < 0) {
 		error ("step_create: qsw_setup_jobinfo error %m");
 		delete_step_record (job_ptr, step_ptr->step_id);
@@ -520,6 +535,7 @@ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer)
 	pack_job_step_info_members(step->job_ptr->job_id,
 				   step->step_id,
 				   step->job_ptr->user_id,
+				   step->num_tasks,
 				   step->start_time,
 				   step->job_ptr->partition,
 				   node_list, buffer);
diff --git a/src/srun/job.c b/src/srun/job.c
index 0ee8af349f2..ed50bf31ed3 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -137,7 +137,11 @@ job_create(resource_allocation_response_msg_t *resp)
 		/* job->ntask[i] = 0; */
 
 		if (resp) {
-			job->cpus[i] = resp->cpus_per_node[cpu_inx];
+			if (opt.overcommit)
+				job->cpus[i] = tph;
+			else
+				job->cpus[i] = resp->cpus_per_node[cpu_inx];
+
 			if ((++cpu_cnt) >= resp->cpu_count_reps[cpu_inx]) {
 				/* move to next record */
 				cpu_inx++;
@@ -151,6 +155,7 @@ job_create(resource_allocation_response_msg_t *resp)
 			slurm_set_addr (&job->slurmd_addr[i], 
 					slurm_get_slurmd_port(), job->host[i]);
 		}
+
 	}
 
 	return job;
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 8203faef8c7..dc1483678f7 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -320,9 +320,14 @@ _allocate_nodes(void)
 
 	job.req_nodes      = opt.nodelist;
 
-	job.num_procs      = opt.nprocs * opt.cpus_per_task;
+	if (opt.overcommit)
+		job.num_procs      = opt.nodes;
+	else
+		job.num_procs      = opt.nprocs * opt.cpus_per_task;
 
-	job.num_nodes = opt.nodes;
+	job.num_nodes      = opt.nodes;
+
+	job.num_tasks      = opt.nprocs;
 
 	job.user_id        = opt.uid;
 
@@ -432,7 +437,11 @@ _create_job_step(job_t *job)
 	req.job_id     = job->jobid;
 	req.user_id    = opt.uid;
 	req.node_count = job->nhosts;
-	req.cpu_count  = opt.nprocs * opt.cpus_per_task;
+	if (opt.overcommit)
+		req.cpu_count  = job->nhosts;
+	else
+		req.cpu_count  = opt.nprocs * opt.cpus_per_task;
+	req.num_tasks  = opt.nprocs;
 	req.node_list  = job->nodelist;
 	req.relative   = false;
 	if (opt.distribution == SRUN_DIST_BLOCK)
-- 
GitLab