From dab7fb0251ec5a713460d42f60e040e492d78ed6 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 22 Oct 2013 10:44:46 -0700
Subject: [PATCH] Problem allocating threads with GPUs

If a node has GRES and multiple threads per core the select/cons_res
plugin can get stuck in an infinite loop.
See bug 475
Contributed by: PREVOST Ludovic NEC HPC Europe
---
 NEWS                                   |  2 ++
 src/plugins/select/cons_res/job_test.c | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/NEWS b/NEWS
index 2059b6d74b6..33ac2b275db 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ documents those changes that are of interest to users and admins.
  -- init scripts ignore quotes around Pid file name specifications.
  -- Fixed typo about command case in quickstart.html.
  -- task/cgroup - handle new cpuset files, similar to commit c4223940.
+ -- select/cons_res with GRES and multiple threads per core, fix possible
+    infinite loop.
 
 * Changes in Slurm 2.6.3
 ========================
diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c
index 8bea0989519..48eb5bbe2c7 100644
--- a/src/plugins/select/cons_res/job_test.c
+++ b/src/plugins/select/cons_res/job_test.c
@@ -717,9 +717,21 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map,
 	    ((job_ptr->details->cpus_per_task > 1) &&
 	     (gres_cpus < job_ptr->details->cpus_per_task)))
 		gres_cpus = 0;
+
 	while (gres_cpus < cpus)
 		cpus -= cpu_alloc_size;
 
+	while (gres_cpus < cpus) {
+		if ((int) cpus < cpu_alloc_size) {
+			debug3("cons_res: cpu_alloc_size > cpus, cannot "
+			       "continue (node: %s)", node_ptr->name);
+			cpus = 0;
+			break;
+		} else {
+			cpus -= cpu_alloc_size;
+		}
+	}
+
 	if (cpus == 0)
 		bit_nclear(core_map, core_start_bit, core_end_bit);
 
-- 
GitLab