From 85c0df8f0afb1fd86aa89dddeb268fe8a5f6712c Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 2 Dec 2002 21:04:21 +0000
Subject: [PATCH] Record task count in job_step_info, pass task_count in job
 and job step initiation. Establish limit of MAX_TASKS_PER_NODE and add
 matching error code.

---
 doc/man/man1/srun.1              |  5 +++--
 src/common/slurm_errno.c         |  2 ++
 src/common/slurm_errno.h         |  1 +
 src/common/slurm_protocol_pack.c | 16 ++++++++++++--
 src/common/slurm_protocol_pack.h |  4 ++--
 src/slurmctld/controller.c       |  6 +++--
 src/slurmctld/job_mgr.c          |  4 ++++
 src/slurmctld/node_scheduler.c   |  3 ++-
 src/slurmctld/slurmctld.h        |  4 +++-
 src/slurmctld/step_mgr.c         | 38 +++++++++++++++++++++++---------
 src/srun/job.c                   |  7 +++++-
 src/srun/srun.c                  | 15 ++++++++++---
 12 files changed, 80 insertions(+), 25 deletions(-)

diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 4ac42b02a87..987ff097024 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -74,9 +74,10 @@ initiation and higher system utilization, but lower application performance.
 \fB\-O\fR, \fB\-\-overcommit\fR
 overcommit resources. Normally,
 .B srun
-will not allocate more than one process to a cpu. By specifying
+will not allocate more than one process per cpu. By specifying
 \fB\-\-overcommit\fR you are explicitly allowing more than one process
-per cpu.
+per cpu. However, no more than \fBMAX_TASKS_PER_NODE\fR tasks are
+permitted to execute per node.
 .TP
 \fB\-T\fR, \fB\-\-threads\fR=\fInthreads\fR
 Request that
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index e77bed6f12c..8e59cf34e6a 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -128,6 +128,8 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "Task distribution specification invalid"	},
 	{ ESLURM_JOB_PENDING,
 	  "Job is pending execution"			},
+	{ ESLURM_BAD_TASK_COUNT,
+	  "Task count specification invalid"		},
 
 	/* Quadrics Elan routine error codes */
 
diff --git a/src/common/slurm_errno.h b/src/common/slurm_errno.h
index c922a81905c..50a05bdd786 100644
--- a/src/common/slurm_errno.h
+++ b/src/common/slurm_errno.h
@@ -85,6 +85,7 @@ enum {
 	ESLURM_INTERCONNECT_FAILURE,
 	ESLURM_BAD_DIST,
 	ESLURM_JOB_PENDING,
+	ESLURM_BAD_TASK_COUNT,
 
 	/* Quadrics Elan routine error codes */
 	ENOSLURM = 3000,
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 33fe3be375a..01dfde4eeef 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1076,6 +1076,8 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t
 	pack32(msg->user_id, buffer);
 	pack32(msg->node_count, buffer);
 	pack32(msg->cpu_count, buffer);
+	pack32(msg->num_tasks, buffer);
+
 	pack16(msg->relative, buffer);
 	pack16(msg->task_dist, buffer);
 	packstr(msg->node_list, buffer);
@@ -1097,6 +1099,8 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg,
 	safe_unpack32(&(tmp_ptr->user_id), buffer);
 	safe_unpack32(&(tmp_ptr->node_count), buffer);
 	safe_unpack32(&(tmp_ptr->cpu_count), buffer);
+	safe_unpack32(&(tmp_ptr->num_tasks), buffer);
+
 	safe_unpack16(&(tmp_ptr->relative), buffer);
 	safe_unpack16(&(tmp_ptr->task_dist), buffer);
 	safe_unpackstr_xmalloc(&(tmp_ptr->node_list), &uint16_tmp, buffer);
@@ -1337,12 +1341,15 @@ _unpack_partition_info_members(partition_info_t * part, Buf buffer)
  */
 void
 pack_job_step_info_members(uint32_t job_id, uint16_t step_id,
-			   uint32_t user_id, time_t start_time,
-			   char *partition, char *nodes, Buf buffer)
+			   uint32_t user_id, uint32_t num_tasks,
+			   time_t start_time, char *partition,
+			   char *nodes, Buf buffer)
 {
 	pack32(job_id, buffer);
 	pack16(step_id, buffer);
 	pack32(user_id, buffer);
+	pack32(num_tasks, buffer);
+
 	pack_time(start_time, buffer);
 	packstr(partition, buffer);
 	packstr(nodes, buffer);
@@ -1361,6 +1368,7 @@ pack_job_step_info(job_step_info_t * step, Buf buffer)
 	pack_job_step_info_members(step->job_id,
 				   step->step_id,
 				   step->user_id,
+				   step->num_tasks,
 				   step->start_time,
 				   step->partition, step->nodes, buffer);
 }
@@ -1379,6 +1387,8 @@ _unpack_job_step_info_members(job_step_info_t * step, Buf buffer)
 	safe_unpack32(&step->job_id, buffer);
 	safe_unpack16(&step->step_id, buffer);
 	safe_unpack32(&step->user_id, buffer);
+	safe_unpack32(&step->num_tasks, buffer);
+
 	safe_unpack_time(&step->start_time, buffer);
 	safe_unpackstr_xmalloc(&step->partition, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&step->nodes, &uint16_tmp, buffer);
@@ -1674,6 +1684,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer)
 
 	pack32(job_desc_ptr->num_procs, buffer);
 	pack32(job_desc_ptr->num_nodes, buffer);
+	pack32(job_desc_ptr->num_tasks, buffer);
 	pack32(job_desc_ptr->user_id, buffer);
 }
 
@@ -1723,6 +1734,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer)
 
 	safe_unpack32(&job_desc_ptr->num_procs, buffer);
 	safe_unpack32(&job_desc_ptr->num_nodes, buffer);
+	safe_unpack32(&job_desc_ptr->num_tasks, buffer);
 	safe_unpack32(&job_desc_ptr->user_id, buffer);
 
 	return SLURM_SUCCESS;
diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h
index 53d23107a11..1a76d1bc4d3 100644
--- a/src/common/slurm_protocol_pack.h
+++ b/src/common/slurm_protocol_pack.h
@@ -151,7 +151,7 @@ void pack_job_step_info ( job_step_info_t* step, Buf buffer );
 *	automatically updated
 */
 void pack_job_step_info_members( uint32_t job_id, uint16_t step_id,
-		uint32_t user_id, time_t start_time, char *partition,
-		char *nodes, Buf buffer );
+		uint32_t user_id, uint32_t num_tasks, time_t start_time,
+		char *partition, char *nodes, Buf buffer );
 
 #endif
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index c6dd05bc100..6526f471d4d 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1497,9 +1497,11 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		return;
 	}
 
-	req_step_msg.job_id = job_id;
-	req_step_msg.user_id = job_desc_msg->user_id;
+	req_step_msg.job_id     = job_id;
+	req_step_msg.user_id    = job_desc_msg->user_id;
 	req_step_msg.node_count = INFINITE;
+	req_step_msg.cpu_count  = job_desc_msg->num_procs;
+	req_step_msg.num_tasks  = job_desc_msg->num_tasks;
 	error_code = step_create(&req_step_msg, &step_rec);
 	/* note: no need to free step_rec, pointer to global job step record */
 	if (error_code) {
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 122bcedf909..808fddecdd4 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -438,6 +438,7 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer)
 
 	pack16((uint16_t) step_ptr->step_id, buffer);
 	pack16((uint16_t) step_ptr->cyclic_alloc, buffer);
+	pack32(step_ptr->num_tasks, buffer);
 	pack_time(step_ptr->start_time, buffer);
 	node_list = bitmap2node_name(step_ptr->node_bitmap);
 	packstr(node_list, buffer);
@@ -687,11 +688,13 @@ int load_job_state(void)
 		while (step_flag == STEP_FLAG) {
 			struct step_record *step_ptr;
 			uint16_t step_id, cyclic_alloc;
+			uint32_t num_tasks;
 			time_t start_time;
 			char *node_list;
 
 			safe_unpack16(&step_id, buffer);
 			safe_unpack16(&cyclic_alloc, buffer);
+			safe_unpack32(&num_tasks, buffer);
 			safe_unpack_time(&start_time, buffer);
 			safe_unpackstr_xmalloc(&node_list, &name_len, buffer);
 
@@ -712,6 +715,7 @@ int load_job_state(void)
 				break;
 			step_ptr->step_id      = step_id;
 			step_ptr->cyclic_alloc = cyclic_alloc;
+			step_ptr->num_tasks    = num_tasks;
 			step_ptr->start_time   = start_time;
 			info("recovered job step %u.%u", job_id, step_id);
 			if (node_list) {
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 2406254f5a4..d2d1ec87c13 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -65,7 +65,8 @@ static int _pick_best_nodes(struct node_set *node_set_ptr,
 static int _valid_features(char *requested, char *available);
 
 
-/* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
+/*
+ * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
  * IN bitmap - map of nodes to be allocated
  * globals: node_record_count - number of nodes in the system
  *	node_record_table_ptr - pointer to global node table
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index c9354d1e135..fcaf15b2bfc 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -251,6 +251,7 @@ struct step_record {
 	uint16_t step_id;		/* step number */
 	uint16_t cyclic_alloc;		/* set for cyclic task allocation
 					   across nodes */
+	uint32_t num_tasks;		/* number of tasks required */
 	time_t start_time;		/* step allocation time */
 	bitstr_t *node_bitmap;		/* bitmap of nodes allocated to job
 					   step */
@@ -266,7 +267,8 @@ extern List job_list;	/* list of job_record entries */
 * Global slurmctld functions
 \*****************************************************************************/
 
-/* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
+/*
+ * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
 * IN bitmap - map of nodes to be allocated
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index bbf6d8d22ba..09d4f35f9c4 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -156,9 +156,9 @@ dump_step_desc(step_specs *step_spec)
 	debug3("StepDesc: user_id=%u job_id=%u node_count=%u, cpu_count=%u",
 	       step_spec->user_id, step_spec->job_id,
 	       step_spec->node_count, step_spec->cpu_count);
-	debug3("   relative=%u task_dist=%u node_list=%s",
-	       step_spec->relative, step_spec->task_dist,
-	       step_spec->node_list);
+	debug3("   num_tasks=%u relative=%u task_dist=%u node_list=%s",
+	       step_spec->num_tasks, step_spec->relative,
+	       step_spec->task_dist, step_spec->node_list);
 }
 
 
@@ -410,7 +410,7 @@ cleanup:
 
 /*
 * step_create - creates a step_record in step_specs->job_id, sets up the
- *	accoding to the step_specs.
+ *	according to the step_specs.
 * IN step_specs - job step specifications
 * OUT new_step_record - pointer to the new step_record (NULL on error)
 * RET - 0 or error code
@@ -423,9 +423,9 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 	struct step_record *step_ptr;
 	struct job_record  *job_ptr;
 	bitstr_t *nodeset;
+	int node_count;
 #ifdef HAVE_LIBELAN3
 	int first, last, i, node_id;
-	int nprocs = step_specs->cpu_count;
 	int node_set_size = QSW_MAX_TASKS;	/* overkill but safe */
 #endif
 
@@ -434,15 +434,14 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 	if (job_ptr == NULL)
 		return ESLURM_INVALID_JOB_ID ;
 
-	if (step_specs->user_id != job_ptr->user_id &&
-	    step_specs->user_id != 0)
+	if ((step_specs->user_id != job_ptr->user_id) &&
+	    (step_specs->user_id != 0))
 		return ESLURM_ACCESS_DENIED ;
 
 	if ((job_ptr->job_state == JOB_COMPLETE) ||
 	    (job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_TIMEOUT))
 		return ESLURM_ALREADY_DONE;
-	job_ptr->time_last_active = time(NULL);
 
 #ifdef HAVE_LIBELAN3
 	if ((step_specs->task_dist != SLURM_DIST_CYCLIC) &&
@@ -450,10 +449,25 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 		return ESLURM_BAD_DIST;
 #endif
 
-	nodeset = _pick_step_nodes (job_ptr, step_specs );
-
+	job_ptr->time_last_active = time(NULL);
+	nodeset = _pick_step_nodes (job_ptr, step_specs);
 	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
+	node_count = bit_set_count(nodeset);
+
+	if (step_specs->num_tasks == NO_VAL) {
+		if (step_specs->cpu_count != NO_VAL)
+			step_specs->num_tasks = step_specs->cpu_count;
+		else
+			step_specs->num_tasks = node_count;
+	}
+	if ((step_specs->num_tasks < 1) ||
+	    (step_specs->num_tasks > (node_count*MAX_TASKS_PER_NODE)))
+		return ESLURM_BAD_TASK_COUNT;
+#ifdef HAVE_LIBELAN3
+	if (step_specs->num_tasks > node_set_size)
+		return ESLURM_BAD_TASK_COUNT;
+#endif
 
 	step_ptr = create_step_record (job_ptr);
 	if (step_ptr == NULL)
@@ -463,6 +477,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 	step_ptr->node_bitmap = nodeset;
 	step_ptr->cyclic_alloc = (uint16_t)
 		(step_specs->task_dist == SLURM_DIST_CYCLIC);
+	step_ptr->num_tasks = step_specs->num_tasks;
 
 #ifdef HAVE_LIBELAN3
 	if (qsw_alloc_jobinfo (&step_ptr->qsw_job) < 0)
@@ -488,7 +503,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 			}
 		}
 	}
-	if (qsw_setup_jobinfo (step_ptr->qsw_job, nprocs,
+	if (qsw_setup_jobinfo (step_ptr->qsw_job, step_specs->num_tasks,
 			nodeset, step_ptr->cyclic_alloc) < 0) {
 		error ("step_create: qsw_setup_jobinfo error %m");
 		delete_step_record (job_ptr, step_ptr->step_id);
@@ -520,6 +535,7 @@ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer)
 	pack_job_step_info_members(step->job_ptr->job_id,
 				   step->step_id,
 				   step->job_ptr->user_id,
+				   step->num_tasks,
 				   step->start_time,
 				   step->job_ptr->partition,
 				   node_list, buffer);
diff --git a/src/srun/job.c b/src/srun/job.c
index 0ee8af349f2..ed50bf31ed3 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -137,7 +137,11 @@ job_create(resource_allocation_response_msg_t *resp)
 		/* job->ntask[i] = 0; */
 
 		if (resp) {
-			job->cpus[i] = resp->cpus_per_node[cpu_inx];
+			if (opt.overcommit)
+				job->cpus[i] = tph;
+			else
+				job->cpus[i] = resp->cpus_per_node[cpu_inx];
+
 			if ((++cpu_cnt) >= resp->cpu_count_reps[cpu_inx]) {
 				/* move to next record */
 				cpu_inx++;
@@ -151,6 +155,7 @@ job_create(resource_allocation_response_msg_t *resp)
 			slurm_set_addr (&job->slurmd_addr[i], slurm_get_slurmd_port(),
 					job->host[i]);
 		}
+	}
 
 	return job;
 
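The task-count validation that the step_mgr.c hunks above add to step_create() can be read in isolation: default num_tasks from the CPU count (else one task per allocated node), then bound it by the allocation size times the per-node ceiling. Below is a minimal standalone sketch of that logic, not SLURM source: the NO_VAL sentinel value and the MAX_TASKS_PER_NODE limit are illustrative assumptions standing in for the project's definitions, and the -1 return stands in for ESLURM_BAD_TASK_COUNT.

/* Standalone sketch of step_create()'s num_tasks defaulting and
 * bounds check.  NO_VAL and MAX_TASKS_PER_NODE are assumed values,
 * not SLURM's. */
#include <stdint.h>
#include <stdio.h>

#define NO_VAL             ((uint32_t) 0xfffffffe)	/* assumed sentinel */
#define MAX_TASKS_PER_NODE 64				/* assumed limit    */

/* Returns 0 on success, -1 for a bad task count. */
static int validate_task_count(uint32_t *num_tasks, uint32_t cpu_count,
			       int node_count)
{
	/* Default: prefer the requested CPU count, else one task per
	 * allocated node, mirroring the step_create() hunk above. */
	if (*num_tasks == NO_VAL) {
		if (cpu_count != NO_VAL)
			*num_tasks = cpu_count;
		else
			*num_tasks = (uint32_t) node_count;
	}

	/* Reject zero tasks and anything over the per-node ceiling. */
	if ((*num_tasks < 1) ||
	    (*num_tasks > (uint32_t)(node_count * MAX_TASKS_PER_NODE)))
		return -1;
	return 0;
}

int main(void)
{
	uint32_t num_tasks = NO_VAL;

	/* 4 nodes, no explicit task or CPU count: defaults to 4 tasks. */
	if (validate_task_count(&num_tasks, NO_VAL, 4) == 0)
		printf("defaulted num_tasks=%u\n", (unsigned) num_tasks);

	/* 4 nodes, 1024 tasks: exceeds 4 * MAX_TASKS_PER_NODE. */
	num_tasks = 1024;
	if (validate_task_count(&num_tasks, NO_VAL, 4) != 0)
		printf("rejected oversized task count\n");
	return 0;
}

Defaulting to the CPU count preserves the old one-task-per-cpu behavior when no explicit task count arrives, while the node-count fallback covers requests made without a CPU count.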
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 8203faef8c7..dc1483678f7 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -320,9 +320,14 @@ _allocate_nodes(void)
 
 	job.req_nodes = opt.nodelist;
 
-	job.num_procs = opt.nprocs * opt.cpus_per_task;
+	if (opt.overcommit)
+		job.num_procs = opt.nodes;
+	else
+		job.num_procs = opt.nprocs * opt.cpus_per_task;
 
-	job.num_nodes = opt.nodes;
+	job.num_nodes = opt.nodes;
+
+	job.num_tasks = opt.nprocs;
 
 	job.user_id = opt.uid;
 
@@ -432,7 +437,11 @@ _create_job_step(job_t *job)
 	req.job_id = job->jobid;
 	req.user_id = opt.uid;
 	req.node_count = job->nhosts;
-	req.cpu_count = opt.nprocs * opt.cpus_per_task;
+	if (opt.overcommit)
+		req.cpu_count = job->nhosts;
+	else
+		req.cpu_count = opt.nprocs * opt.cpus_per_task;
+	req.num_tasks = opt.nprocs;
 	req.node_list = job->nodelist;
 	req.relative = false;
 	if (opt.distribution == SRUN_DIST_BLOCK)
-- 
GitLab
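The srun.c hunks above show why num_tasks must now travel separately from the CPU request: with --overcommit the client asks for only one CPU per node, so the task count can no longer be inferred from cpu_count. The following is a minimal sketch of that derivation under stated assumptions, not SLURM source; the struct and parameter names are hypothetical stand-ins for the opt_t fields and request messages used in the patch.

/* Sketch of the overcommit accounting in _allocate_nodes() and
 * _create_job_step() above.  Names are illustrative, not SLURM's. */
#include <stdint.h>
#include <stdio.h>

struct request {
	uint32_t num_procs;	/* CPUs to allocate */
	uint32_t num_tasks;	/* tasks to launch  */
};

/* With overcommit set, request one CPU per node; the task count is
 * carried independently so slurmctld can still enforce the
 * MAX_TASKS_PER_NODE ceiling in step_create(). */
static struct request build_request(int overcommit, uint32_t nodes,
				    uint32_t nprocs, uint32_t cpus_per_task)
{
	struct request req;

	req.num_procs = overcommit ? nodes : (nprocs * cpus_per_task);
	req.num_tasks = nprocs;
	return req;
}

int main(void)
{
	/* Like "srun -N2 -n16 -O": 2 CPUs requested, 16 tasks launched. */
	struct request req = build_request(1, 2, 16, 1);

	printf("procs=%u tasks=%u\n",
	       (unsigned) req.num_procs, (unsigned) req.num_tasks);
	return 0;
}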