From dab7fb0251ec5a713460d42f60e040e492d78ed6 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 22 Oct 2013 10:44:46 -0700
Subject: [PATCH] Problem allocating threads with GPUs

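If a node has GRES and multiple threads per core, the select/cons_res
plugin can get stuck in an infinite loop while reducing the CPU count
in steps of cpu_alloc_size. Stop the reduction and set the CPU count
to zero once fewer than cpu_alloc_size CPUs remain.
See bug 475.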
Contributed by:
PREVOST Ludovic
NEC HPC Europe
---
 NEWS                                   |  2 ++
 src/plugins/select/cons_res/job_test.c | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/NEWS b/NEWS
index 2059b6d74b6..33ac2b275db 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ documents those changes that are of interest to users and admins.
  -- init scripts ignore quotes around Pid file name specifications.
  -- Fixed typo about command case in quickstart.html.
  -- task/cgroup - handle new cpuset files, similar to commit c4223940.
+ -- select/cons_res with GRES and multiple threads per core, fix possible
+    infinite loop.
 
 * Changes in Slurm 2.6.3
 ========================
diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c
index 8bea0989519..48eb5bbe2c7 100644
--- a/src/plugins/select/cons_res/job_test.c
+++ b/src/plugins/select/cons_res/job_test.c
@@ -717,9 +717,21 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map,
 	    ((job_ptr->details->cpus_per_task > 1) &&
 	     (gres_cpus < job_ptr->details->cpus_per_task)))
 		gres_cpus = 0;
+
 	while (gres_cpus < cpus)
 		cpus -= cpu_alloc_size;
 
+	while (gres_cpus < cpus) {
+		if ((int) cpus < cpu_alloc_size) {
+			debug3("cons_res: cpu_alloc_size > cpus, cannot "
+			       "continue (node: %s)", node_ptr->name);
+			cpus = 0;
+			break;
+		} else {
+			cpus -= cpu_alloc_size;
+		}
+	}
+
 	if (cpus == 0)
 		bit_nclear(core_map, core_start_bit, core_end_bit);
 
-- 
GitLab