diff --git a/NEWS b/NEWS
index d307f23d66fe524c1fbdd8ddaed6de063bbe4a6d..0440069b1d093bd7a0952344b1019992fa815819 100644
--- a/NEWS
+++ b/NEWS
@@ -56,6 +56,15 @@ documents those changes that are of interest to users and admins.
     command is issued.
  -- BLUEGENE - Fix for -n option to work on correct cpu counts for each
    midplane instead of treating -n as a c-node count.
+ -- salloc now sets SLURM_NTASKS_PER_NODE if --ntasks-per-node option is set.
+ -- Fix select/linear to properly set a job's count of allocated processors
+    (all processors on the allocated nodes).
+ -- Fix select/cons_res to allocate proper CPU count when --ntasks-per-node
+    option is used.
+ -- Insure that no node is allocated to a job for which the CPU count is less
+    than --ntasks-per-node * --cpus-per-task.
+ -- Correct AllocProcs reported by "scontrol show node" when ThreadsPerCore
+    is greater than 1 and select/cons_res is used.
 
 * Changes in SLURM 2.0.1
 ========================
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index f896961d50be93af9ebfeb9435178e5daffab4dc..2535bde4194ca5cd692c971595f9b57a7cfecaea 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -932,6 +932,9 @@ Total number of nodes in the job allocation.
 \fBSLURM_MEM_BIND\fR
 Set to value of the \-\-mem_bind\fR option.
 .TP
+\fBSLURM_NTASKS_PER_NODE\fR
+Set to value of the \-\-ntasks\-per\-node\fR option, if specified.
+.TP
 \fBSLURM_TASKS_PER_NODE\fR
 Number of tasks to be initiated on each node. Values are
 comma separated and in the same order as SLURM_NODELIST.
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 2ca72fb075ed12a3bc6b8f165e850e8fe9b7afa0..1d0641ec74f60c1dea62db38de66cc2aa1ac5d89 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -1327,13 +1327,13 @@ Same as \fB\-\-network\fR
 \fBSLURM_NNODES\fR
 Same as \fB\-N, \-\-nodes\fR
 .TP
-\fBSLURN_NTASKS_PER_CORE\fR
+\fBSLURM_NTASKS_PER_CORE\fR
 Same as \fB\-\-ntasks\-per\-core\fR
 .TP
-\fBSLURN_NTASKS_PER_NODE\fR
+\fBSLURM_NTASKS_PER_NODE\fR
 Same as \fB\-\-ntasks\-per\-node\fR
 .TP
-\fBSLURN_NTASKS_PER_SOCKET\fR
+\fBSLURM_NTASKS_PER_SOCKET\fR
 Same as \fB\-\-ntasks\-per\-socket\fR
 .TP
 \fBSLURM_NO_ROTATE\fR
diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c
index 9c0f074a7cf5528dda4236bceec77a8182ea3c14..f4d0a34d0f25a2f4ae1c78794558033689b35d96 100644
--- a/src/plugins/select/cons_res/job_test.c
+++ b/src/plugins/select/cons_res/job_test.c
@@ -2016,7 +2016,10 @@ alloc_job:
         if (job_res->node_bitmap == NULL)
                 fatal("bit_copy malloc failure");
         job_res->nhosts = bit_set_count(bitmap);
-        job_res->nprocs = MAX(job_ptr->num_procs, job_res->nhosts);
+        job_res->nprocs = job_res->nhosts;
+        if (job_ptr->details->ntasks_per_node)
+                job_res->nprocs *= job_ptr->details->ntasks_per_node;
+        job_res->nprocs = MAX(job_res->nprocs, job_ptr->num_procs);
         job_res->node_req = job_node_req;
         job_res->cpus = cpu_count;
         job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t));
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 85ed16d79cdb8bc51df73ae01d2fecd02528dd9e..14a4754daadead8ca301b5070982e6a1975db5c1 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -1615,6 +1615,7 @@ extern int select_p_select_nodeinfo_set_all(void)
         int i=0, n=0, c, start, end;
         uint16_t tmp, tmp_16 = 0;
         static time_t last_set_all = 0;
+        uint32_t node_threads, node_cpus;
 
         /* only set this once when the last_node_update is newer than the
            last time we set things up. */
@@ -1628,6 +1629,13 @@ extern int select_p_select_nodeinfo_set_all(void)
 
         for (n=0; n < node_record_count; n++) {
                 node_ptr = &(node_record_table_ptr[n]);
+                if (slurmctld_conf.fast_schedule) {
+                        node_cpus = node_ptr->config_ptr->cpus;
+                        node_threads = node_ptr->config_ptr->threads;
+                } else {
+                        node_cpus = node_ptr->cpus;
+                        node_threads = node_ptr->threads;
+                }
                 start = cr_get_coremap_offset(n);
                 end = cr_get_coremap_offset(n+1);
 
@@ -1650,6 +1658,12 @@ extern int select_p_select_nodeinfo_set_all(void)
                                 tmp_16 = tmp;
                         }
                 }
+
+                /* The minimum allocatable unit may be a core, so scale
+                 * threads up to the proper CPU count */
+                if ((end - start) < node_cpus)
+                        tmp_16 *= node_threads;
+
                 node_ptr->select_nodeinfo->alloc_cpus = tmp_16;
         }
 
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 0a76bc84445428c99b4841f501848e5f00e6a246..de02e6e1d94f9f713727ed2a8e03ae325964923b 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -106,6 +106,8 @@ static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap,
                           uint32_t min_nodes, uint32_t max_nodes,
                           uint32_t req_nodes);
 static void _free_node_cr(struct node_cr_record *node_cr_ptr);
+static uint16_t _get_avail_cpus(struct job_record *job_ptr, int index);
+static uint16_t _get_total_cpus(int index);
 static void _init_node_cr(void);
 static int _job_count_bitmap(struct node_cr_record *node_cr_ptr,
                              struct job_record *job_ptr,
@@ -367,6 +369,21 @@ static uint16_t _get_avail_cpus(struct job_record *job_ptr, int index)
         return(avail_cpus);
 }
 
+/*
+ * _get_total_cpus - Get the total number of cpus on a node
+ *      Note that the value of cpus is the lowest-level logical
+ *      processor (LLLP).
+ * IN index - index of node's configuration information in select_node_ptr
+ */
+static uint16_t _get_total_cpus(int index)
+{
+        struct node_record *node_ptr = &(select_node_ptr[index]);
+        if (select_fast_schedule)
+                return node_ptr->config_ptr->cpus;
+        else
+                return node_ptr->cpus;
+}
+
 /* Build the full select_job_res_t structure for a job based upon the nodes
  * allocated to it (the bitmap) and the job's memory requirement */
 static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap)
@@ -670,7 +687,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
         int rem_cpus, rem_nodes;        /* remaining resources desired */
         int best_fit_nodes, best_fit_cpus, best_fit_req;
         int best_fit_location = 0, best_fit_sufficient;
-        int avail_cpus, alloc_cpus = 0;
+        int avail_cpus, alloc_cpus = 0, total_cpus = 0;
 
         if (bit_set_count(bitmap) < min_nodes)
                 return error_code;
@@ -718,10 +735,11 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
                                 /* first required node in set */
                                 consec_req[consec_index] = index;
                         }
-                        rem_cpus -= avail_cpus;
-                        alloc_cpus += avail_cpus;
                         rem_nodes--;
                         max_nodes--;
+                        rem_cpus -= avail_cpus;
+                        alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(index);
                 } else {        /* node not required (yet) */
                         bit_clear(bitmap, index);
                         consec_cpus[consec_index] += avail_cpus;
@@ -850,6 +868,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
                         avail_cpus = _get_avail_cpus(job_ptr, i);
                         rem_cpus -= avail_cpus;
                         alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(i);
                 }
                 for (i = (best_fit_req - 1);
                      i >= consec_start[best_fit_location]; i--) {
@@ -864,6 +883,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
                         avail_cpus = _get_avail_cpus(job_ptr, i);
                         rem_cpus -= avail_cpus;
                         alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(i);
                 }
         } else {
                 for (i = consec_start[best_fit_location];
@@ -879,6 +899,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
                         avail_cpus = _get_avail_cpus(job_ptr, i);
                         rem_cpus -= avail_cpus;
                         alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(i);
                 }
         }
         if (job_ptr->details->contiguous ||
@@ -896,7 +917,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
         }
         if (error_code == SLURM_SUCCESS) {
                 /* job's total_procs is needed for SELECT_MODE_WILL_RUN */
-                job_ptr->total_procs = alloc_cpus;
+                job_ptr->total_procs = total_cpus;
         }
 
         xfree(consec_cpus);
@@ -924,7 +945,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
         bitstr_t *avail_nodes_bitmap = NULL;    /* nodes on any switch */
         bitstr_t *req_nodes_bitmap = NULL;
         int rem_cpus, rem_nodes;        /* remaining resources desired */
-        int avail_cpus, alloc_cpus = 0;
+        int avail_cpus, alloc_cpus = 0, total_cpus = 0;
         int i, j, rc = SLURM_SUCCESS;
         int best_fit_inx, first, last;
         int best_fit_nodes, best_fit_cpus;
@@ -1012,6 +1033,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
                         avail_cpus = _get_avail_cpus(job_ptr, i);
                         rem_cpus -= avail_cpus;
                         alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(i);
                         for (j=0; j<switch_record_cnt; j++) {
                                 if (!bit_test(switches_bitmap[j], i))
                                         continue;
@@ -1049,6 +1071,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
                         avail_cpus = _get_avail_cpus(job_ptr, i);
                         rem_cpus -= avail_cpus;
                         alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(i);
                 }
         }
         if ((rem_nodes <= 0) && (rem_cpus <= 0))
@@ -1173,6 +1196,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
                         max_nodes--;
                         rem_cpus -= avail_cpus;
                         alloc_cpus += avail_cpus;
+                        total_cpus += _get_total_cpus(i);
                         if ((max_nodes <= 0) ||
                             ((rem_nodes <= 0) && (rem_cpus <= 0)))
                                 break;
@@ -1187,7 +1211,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
 
 fini:   if (rc == SLURM_SUCCESS) {
                 /* Job's total_procs is needed for SELECT_MODE_WILL_RUN */
-                job_ptr->total_procs = alloc_cpus;
+                job_ptr->total_procs = total_cpus;
         }
         FREE_NULL_BITMAP(avail_nodes_bitmap);
         FREE_NULL_BITMAP(req_nodes_bitmap);
diff --git a/src/salloc/opt.c b/src/salloc/opt.c
index 3aed394b69d86421c59e0e1b779610fad04f4328..50f5012935af5792ed5bc8d020e4bd012284e170 100644
--- a/src/salloc/opt.c
+++ b/src/salloc/opt.c
@@ -1301,6 +1301,11 @@ static bool _opt_verify(void)
                         setenvf(NULL, "SLURM_MEM_BIND", "%s", tmp);
                 }
         }
+        if ((opt.ntasks_per_node != NO_VAL) &&
+            (getenv("SLURM_NTASKS_PER_NODE") == NULL)) {
+                setenvf(NULL, "SLURM_NTASKS_PER_NODE", "%d",
+                        opt.ntasks_per_node);
+        }
 
         return verified;
 }
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 693ee23300cbe01ade879dcb0bc26fdde286f0ed..b287816d2db5a728e00afa1b03d0e3c1e28f678f 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -3380,16 +3380,21 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
                 detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1);
         else
                 detail_ptr->cpus_per_task = 1;
-        if (job_desc->ntasks_per_node != (uint16_t) NO_VAL)
+        if (job_desc->job_min_procs != (uint16_t) NO_VAL)
+                detail_ptr->job_min_procs = job_desc->job_min_procs;
+        if (job_desc->ntasks_per_node != (uint16_t) NO_VAL) {
                 detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
+                detail_ptr->job_min_procs = MAX(detail_ptr->job_min_procs,
+                                                (detail_ptr->cpus_per_task *
+                                                 detail_ptr->ntasks_per_node));
+        } else {
+                detail_ptr->job_min_procs = MAX(detail_ptr->job_min_procs,
+                                                detail_ptr->cpus_per_task);
+        }
         if (job_desc->requeue != (uint16_t) NO_VAL)
                 detail_ptr->requeue = MIN(job_desc->requeue, 1);
         else
                 detail_ptr->requeue = slurmctld_conf.job_requeue;
-        if (job_desc->job_min_procs != (uint16_t) NO_VAL)
-                detail_ptr->job_min_procs = job_desc->job_min_procs;
-        detail_ptr->job_min_procs = MAX(detail_ptr->job_min_procs,
-                                        detail_ptr->cpus_per_task);
         if (job_desc->job_min_memory != NO_VAL)
                 detail_ptr->job_min_memory = job_desc->job_min_memory;
         if (job_desc->job_min_tmp_disk != NO_VAL)
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 74dec83a5ec3933839ab837a3c09d36c5dcf4b0b..953b3fa90b548a2eb9c2d7e4259a6013b61b5bac 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -494,6 +494,7 @@ env_vars_t env_vars[] = {
 {"SLURM_NSOCKETS_PER_NODE",OPT_NSOCKETS,NULL,           NULL             },
 {"SLURM_NCORES_PER_SOCKET",OPT_NCORES,  NULL,           NULL             },
 {"SLURM_NTHREADS_PER_CORE",OPT_NTHREADS,NULL,           NULL             },
+{"SLURM_NTASKS_PER_NODE",  OPT_INT,     &opt.ntasks_per_node, NULL       },
 {"SLURM_NO_ROTATE",     OPT_NO_ROTATE,  NULL,           NULL             },
 {"SLURM_NPROCS",        OPT_INT,        &opt.nprocs,    &opt.nprocs_set  },
 {"SLURM_OVERCOMMIT",    OPT_OVERCOMMIT, NULL,           NULL             },
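
The arithmetic this patch enforces can be summarized in a small stand-alone
sketch. This is illustrative only and not the SLURM API: the function and
parameter names below are invented for the example, but the logic mirrors the
new per-node CPU floor in job_mgr.c and the job_res->nprocs calculation in
select/cons_res/job_test.c.

#include <stdint.h>
#include <stdio.h>

#define MAX(a,b) (((a) > (b)) ? (a) : (b))

/* Per-node CPU floor: with --ntasks-per-node set, a node must be able to
 * supply at least ntasks_per_node * cpus_per_task CPUs before it may be
 * allocated (names here are illustrative stand-ins). */
static uint32_t min_cpus_per_node(uint32_t job_min_procs,
                                  uint16_t cpus_per_task,
                                  uint16_t ntasks_per_node)
{
        if (ntasks_per_node)
                return MAX(job_min_procs,
                           (uint32_t) cpus_per_task * ntasks_per_node);
        return MAX(job_min_procs, (uint32_t) cpus_per_task);
}

/* Allocated processor count: one slot per allocated node, scaled by
 * --ntasks-per-node, and never below the job's requested processor count. */
static uint32_t job_nprocs(uint32_t nhosts, uint16_t ntasks_per_node,
                           uint32_t num_procs)
{
        uint32_t nprocs = nhosts;

        if (ntasks_per_node)
                nprocs *= ntasks_per_node;
        return MAX(nprocs, num_procs);
}

int main(void)
{
        /* Example: 2 nodes, --ntasks-per-node=4, --cpus-per-task=2, 8 tasks.
         * Each node must offer at least 8 CPUs, and the job is charged for
         * at least 8 processors. */
        printf("min CPUs per node: %u\n", min_cpus_per_node(1, 2, 4));
        printf("job nprocs:        %u\n", job_nprocs(2, 4, 8));
        return 0;
}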
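
The AllocProcs correction in select_cons_res.c follows the same pattern: the
cons_res core map carries one bit per allocatable unit, and when that unit is
a core rather than a thread the allocated-unit count must be scaled by the
node's ThreadsPerCore before being reported as CPUs. A minimal sketch, again
with invented names rather than the plugin's real interface:

#include <stdint.h>
#include <stdio.h>

/* Convert allocated units from the core map into a CPU count.  If the map
 * holds fewer bits than the node has CPUs, each bit represents a whole core
 * and is multiplied by the thread count, as the patched
 * select_p_select_nodeinfo_set_all() now does. */
static uint16_t alloc_cpus_from_units(uint16_t alloc_units,
                                      uint16_t coremap_bits,
                                      uint16_t node_cpus,
                                      uint16_t node_threads)
{
        if (coremap_bits < node_cpus)
                alloc_units *= node_threads;
        return alloc_units;
}

int main(void)
{
        /* Example node: 8 cores, ThreadsPerCore=2 (16 CPUs), 4 cores in use.
         * "scontrol show node" should now report AllocProcs=8, not 4. */
        printf("AllocProcs: %u\n", alloc_cpus_from_units(4, 8, 16, 2));
        return 0;
}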