From 43281b779ed0ed9cf69fd030c61fa2012e963b10 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 25 Nov 2008 18:18:33 +0000
Subject: [PATCH] Fix job preemption when sched/gang and select/linear are
 configured with non-sharing partitions. preempt_linear.patch from Chris
 Holmes

---
 NEWS                                          |  2 +
 src/plugins/select/cons_res/select_cons_res.c |  2 +-
 src/plugins/select/linear/select_linear.c     | 71 ++++++++++++++++++-
 src/slurmctld/node_scheduler.c                |  7 ++
 4 files changed, 80 insertions(+), 2 deletions(-)
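
Reviewer notes (text between the "---" fold and the first diff is ignored
by "git am"):

The heart of the fix is the new per-node filter in _job_count_bitmap():
when gang scheduling is active, a pending job must not be handed a node on
which a higher-priority partition already has an allocation, since the
gang scheduler would immediately suspend one side. The following minimal,
standalone sketch illustrates that rule; struct part_use and
blocked_by_higher_priority() are invented for illustration (the real code
walks the struct part_cr_record list on each struct node_cr_record, and
skips this test on the NO_SHARE_LIMIT pass):

#include <stdbool.h>
#include <stdio.h>

/* Invented, simplified stand-in for SLURM's struct part_cr_record;
 * only the fields the filter needs. */
struct part_use {
	unsigned priority;	/* priority of the partition */
	int tot_job_cnt;	/* running + suspended jobs on this node */
	struct part_use *next;
};

/* A node is off-limits to a pending job if any higher-priority partition
 * already has an allocation there; handing it out would only trigger a
 * gang-scheduler preemption of the new job. */
static bool blocked_by_higher_priority(const struct part_use *parts,
				       unsigned job_part_priority)
{
	const struct part_use *p;
	for (p = parts; p; p = p->next) {
		if ((p->priority > job_part_priority) && (p->tot_job_cnt > 0))
			return true;
	}
	return false;
}

int main(void)
{
	struct part_use low  = { 1, 2, NULL };	/* priority 1, two jobs */
	struct part_use high = { 5, 1, &low };	/* priority 5, one job  */

	/* A job from a priority-3 partition is blocked by "high". */
	printf("blocked: %d\n", blocked_by_higher_priority(&high, 3));
	/* A priority-9 job is not blocked; the jobs on "high" and "low"
	 * would be gang-suspended in its favor instead. */
	printf("blocked: %d\n", blocked_by_higher_priority(&high, 9));
	return 0;
}

The second half of the new block handles the equal-priority case: a
non-sharing job counts jobs from every equal-priority partition, while a
sharing job counts only its own partition, preserving the existing
load-balancing behavior.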
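
The node_scheduler.c hunk widens the documented contract of
_resolve_shared_status() without changing its body (its full parameter
list is truncated in the hunk context). Below is a hedged sketch of the
documented 0/1/2 contract, assuming a simplified two-argument signature;
SHARED_FORCE, NO_VAL16, and the max_share encoding are stand-ins for the
real SLURM definitions, and "enforced by cons_res" is not modeled:

#include <stdint.h>
#include <stdio.h>

#define SHARED_FORCE	0x8000			/* stand-in force-sharing flag */
#define NO_VAL16	((uint16_t) 0xfffe)	/* stand-in for (uint16_t) NO_VAL */

/* 0 = no sharing, 1 = user requested sharing, 2 = sharing enforced.
 * cons_res needs the 1-vs-2 distinction; select/linear treats any
 * non-zero result as "sharing allowed". */
static int resolve_shared_status(uint16_t user_flag, uint16_t part_max_share)
{
	if (part_max_share & SHARED_FORCE)
		return 2;	/* partition enforces sharing (Shared=FORCE) */
	if ((part_max_share & ~SHARED_FORCE) <= 1)
		return 0;	/* partition disallows sharing */
	if (user_flag == 1)
		return 1;	/* user asked for sharing (Shared=yes) */
	return 0;		/* user_flag 0 (exclusive) or NO_VAL16 (default) */
}

int main(void)
{
	printf("%d\n", resolve_shared_status(NO_VAL16, 4 | SHARED_FORCE)); /* 2 */
	printf("%d\n", resolve_shared_status(1, 4));			   /* 1 */
	printf("%d\n", resolve_shared_status(1, 1));			   /* 0 */
	return 0;
}

This is also why select_p_job_test() in select/linear now tests
job_ptr->details->shared for any non-zero value rather than "== 1": an
enforced value of 2 must take the partition max_share path as well.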

diff --git a/NEWS b/NEWS
index e4d75a0dc72..a6fd38f1bc6 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.4.0-pre6
 =============================
+ -- Fix job preemption when sched/gang and select/linear are configured with
+    non-sharing partitions.
 
 * Changes in SLURM 1.4.0-pre5
 =============================
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 6e453a80736..4bce32878fa 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -228,7 +228,7 @@ extern bool cr_priority_selection_enabled()
 		if (strcmp(sched_type, "sched/gang") == 0)
 			cr_priority_selection = true;
 		xfree(sched_type);
-		cr_priority_selection = true;
+		cr_priority_test = true;
 	}
 	return cr_priority_selection;
 	
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 99038cce410..f2992bf12a0 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -135,6 +135,8 @@ static struct node_record *select_node_ptr = NULL;
 static int select_node_cnt = 0;
 static uint16_t select_fast_schedule;
 static uint16_t cr_type;
+static bool cr_priority_test      = false;
+static bool cr_priority_selection = false;
 
 static struct node_cr_record *node_cr_ptr = NULL;
 static pthread_mutex_t cr_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -240,6 +242,19 @@ static int _fini_status_pthread(void)
 }
 #endif
 
+static inline bool _cr_priority_selection_enabled(void)
+{
+	if (!cr_priority_test) {
+		char *sched_type = slurm_get_sched_type();
+		if (strcmp(sched_type, "sched/gang") == 0)
+			cr_priority_selection = true;
+		xfree(sched_type);
+		cr_priority_test = true;
+	}
+	return cr_priority_selection;
+
+}
+
 static bool _enough_nodes(int avail_nodes, int rem_nodes, 
 		uint32_t min_nodes, uint32_t req_nodes)
 {
@@ -556,7 +571,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	}
 
 	if (mode != SELECT_MODE_TEST_ONLY) {
-		if (job_ptr->details->shared == 1) {
+		if (job_ptr->details->shared) {
 			max_share = job_ptr->part_ptr->max_share & 
 					~SHARED_FORCE;
 		} else	/* ((shared == 0) || (shared == (uint16_t) NO_VAL)) */
@@ -575,6 +590,8 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 		job_ptr->details->job_min_memory = 0;
 	}
 
+	debug3("select/linear: job_test: job %u max_share %d avail nodes %u",
+		job_ptr->job_id, max_share, bit_set_count(bitmap));
 	orig_map = bit_copy(bitmap);
 	for (max_run_job=min_share; max_run_job<max_share; max_run_job++) {
 		bool last_iteration = (max_run_job == (max_share -1));
@@ -586,6 +603,8 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 					      orig_map, bitmap, 
 					      max_run_job, 
 					      max_run_job + sus_jobs);
+			debug3("select/linear: job_test: found %d nodes for %u",
+				j, job_ptr->job_id);
 			if ((j == prev_cnt) || (j < min_nodes))
 				continue;
 			prev_cnt = j;
@@ -697,12 +716,62 @@ static int _job_count_bitmap(struct node_cr_record *node_cr_ptr,
 		}
 
 		if ((run_job_cnt != NO_SHARE_LIMIT) &&
+		    (!_cr_priority_selection_enabled()) &&
 		    (node_cr_ptr[i].exclusive_jobid != 0)) {
 			/* already reserved by some exclusive job */
 			bit_clear(jobmap, i);
 			continue;
 		}
 
+		if (_cr_priority_selection_enabled()) {
+			/* clear this node if any higher-priority
+			 * partitions have existing allocations */
+			total_jobs = 0;
+			part_cr_ptr = node_cr_ptr[i].parts;
+			for( ; part_cr_ptr; part_cr_ptr = part_cr_ptr->next) {
+				if (part_cr_ptr->part_ptr->priority <=
+				    job_ptr->part_ptr->priority)
+					continue;
+				total_jobs += part_cr_ptr->tot_job_cnt;
+			}
+			if ((run_job_cnt != NO_SHARE_LIMIT) &&
+			    (total_jobs > 0)) {
+				bit_clear(jobmap, i);
+				continue;
+			}
+			/* if not sharing, then check with other partitions
+			 * of equal priority. Otherwise, load-balance within
+			 * the local partition */
+			total_jobs = 0;
+			total_run_jobs = 0;
+			part_cr_ptr = node_cr_ptr[i].parts;
+			for( ; part_cr_ptr; part_cr_ptr = part_cr_ptr->next) {
+				if (part_cr_ptr->part_ptr->priority !=
+				    job_ptr->part_ptr->priority)
+					continue;
+				if (!job_ptr->details->shared) {
+					total_run_jobs +=
+						      part_cr_ptr->run_job_cnt;
+					total_jobs += part_cr_ptr->tot_job_cnt;
+					continue;
+				}
+				if (part_cr_ptr->part_ptr == job_ptr->part_ptr){
+					total_run_jobs +=
+						      part_cr_ptr->run_job_cnt;
+					total_jobs += part_cr_ptr->tot_job_cnt;
+					break;
+				}
+			}
+			if ((total_run_jobs <= run_job_cnt) &&
+			    (total_jobs     <= tot_job_cnt)) {
+				bit_set(jobmap, i);
+				count++;
+			} else {
+				bit_clear(jobmap, i);
+			}
+			continue;
+		}
+
 		total_jobs = 0;
 		total_run_jobs = 0;
 		part_cr_ptr = node_cr_ptr[i].parts;
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 51260782bcc..1997099601c 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -278,6 +278,13 @@ static int _match_feature(char *seek, struct node_set *node_set_ptr)
  *	(uint16_t)NO_VAL	= default
  *	0			= exclusive
  *	1			= share=yes
+ *
+ * Return values:
+ *	0 = no sharing
+ *	1 = user requested sharing
+ *	2 = sharing enforced (either by partition or cons_res)
+ * (cons_res plugin needs to distinguish between "enforced" and
+ *  "requested" sharing)
  */
 static int
 _resolve_shared_status(uint16_t user_flag, uint16_t part_max_share,
-- 
GitLab