From b5a8a742368ba3b9b1a8825770e954c49057245d Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 9 Sep 2011 13:42:49 -0700
Subject: [PATCH] Improve performance of preemption logic

This modification improves the performance of SLURM's preemption logic
by reducing the execution time of the scheduling logic and doing a better
job of minimizing the number of jobs preempted to initiate a new job.
Based largely upon work by Phil Eckert, LLNL.
---
 src/plugins/select/cons_res/select_cons_res.c | 73 +++++++++++----
 src/plugins/select/linear/select_linear.c     | 90 +++++++++++++------
 src/slurmctld/slurmctld.h                     |  1 +
 3 files changed, 123 insertions(+), 41 deletions(-)

diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index bd38af07501..02413b03d8d 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -208,6 +208,8 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    uint32_t min_nodes, uint32_t max_nodes,
 		    uint32_t req_nodes, uint16_t job_node_req,
 		    List preemptee_candidates, List *preemptee_job_list);
+static int _sort_usable_nodes_dec(struct job_record *job_a,
+				  struct job_record *job_b);
 static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
 		      uint32_t min_nodes, uint32_t max_nodes,
  		      uint32_t req_nodes, uint16_t job_node_req);
@@ -1495,6 +1497,21 @@ static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
 	return rc;
 }
 
+/*
+ * list_sort() comparator: order jobs by decreasing details->usable_nodes.
+ * Returns -1 if job_a has more usable nodes than job_b, 1 if fewer, else 0.
+ */
+static int _sort_usable_nodes_dec(struct job_record *job_a,
+				  struct job_record *job_b)
+{
+	if (job_a->details->usable_nodes > job_b->details->usable_nodes)
+		return -1;
+	else if (job_a->details->usable_nodes < job_b->details->usable_nodes)
+		return 1;
+
+	return 0;
+}
+
 /* Allocate resources for a job now, if possible */
 static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    uint32_t min_nodes, uint32_t max_nodes,
@@ -1502,15 +1519,17 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    List preemptee_candidates, List *preemptee_job_list)
 {
 	int rc;
-	bitstr_t *orig_map;
+	bitstr_t *orig_map = NULL, *save_bitmap;
 	struct job_record *tmp_job_ptr;
 	ListIterator job_iterator, preemptee_iterator;
 	struct part_res_record *future_part;
 	struct node_use_record *future_usage;
 	bool remove_some_jobs = false;
+	uint16_t pass_count = 0;
 	uint16_t mode;
 
-	orig_map = bit_copy(bitmap);
+	save_bitmap = bit_copy(bitmap);
+top:	orig_map = bit_copy(save_bitmap);
 	if (!orig_map)
 		fatal("bit_copy: malloc failure");
 
@@ -1524,16 +1543,18 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		future_part = _dup_part_data(select_part_record);
 		if (future_part == NULL) {
 			FREE_NULL_BITMAP(orig_map);
+			FREE_NULL_BITMAP(save_bitmap);
 			return SLURM_ERROR;
 		}
 		future_usage = _dup_node_usage(select_node_usage);
 		if (future_usage == NULL) {
 			_destroy_part_data(future_part);
 			FREE_NULL_BITMAP(orig_map);
+			FREE_NULL_BITMAP(save_bitmap);
 			return SLURM_ERROR;
 		}
 
-		job_iterator = list_iterator_create(job_list);
+		job_iterator = list_iterator_create(preemptee_candidates);
 		if (job_iterator == NULL)
 			fatal ("memory allocation failure");
 		while ((tmp_job_ptr = (struct job_record *)
@@ -1546,20 +1567,37 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 			    (mode != PREEMPT_MODE_CHECKPOINT) &&
 			    (mode != PREEMPT_MODE_CANCEL))
 				continue;	/* can't remove job */
-			if (_is_preemptable(tmp_job_ptr,
-					    preemptee_candidates)) {
-				/* Remove preemptable job now */
-				_rm_job_from_res(future_part, future_usage,
-						 tmp_job_ptr, 0);
-				bit_or(bitmap, orig_map);
-				rc = cr_job_test(job_ptr, bitmap, min_nodes,
-						 max_nodes, req_nodes,
-						 SELECT_MODE_WILL_RUN,
-						 cr_type, job_node_req,
-						 select_node_cnt,
-						 future_part, future_usage);
-				if (rc == SLURM_SUCCESS)
+			/* Remove preemptable job now */
+			_rm_job_from_res(future_part, future_usage,
+					 tmp_job_ptr, 0);
+			bit_or(bitmap, orig_map);
+			rc = cr_job_test(job_ptr, bitmap, min_nodes,
+					 max_nodes, req_nodes,
+					 SELECT_MODE_WILL_RUN,
+					 cr_type, job_node_req,
+					 select_node_cnt,
+					 future_part, future_usage);
+			tmp_job_ptr->details->usable_nodes =
+				 bit_overlap(bitmap, tmp_job_ptr->node_bitmap);
+			/*
+			 * If successful, set the last job's usable count to a
+			 * large value so that it will be first after sorting.
+			 * Note: usable_nodes is only used for sorting purposes
+			 */
+			if (rc == SLURM_SUCCESS) {
+				if (pass_count++ ||
+				    (list_count(preemptee_candidates) == 1))
 					break;
+				tmp_job_ptr->details->usable_nodes = 9999;
+				while ((tmp_job_ptr = (struct job_record *)
+					list_next(job_iterator))) {
+					tmp_job_ptr->details->usable_nodes = 0;
+				}
+				list_sort(preemptee_candidates,
+					  (ListCmpF)_sort_usable_nodes_dec);
+				FREE_NULL_BITMAP(orig_map);
+				list_iterator_destroy(job_iterator);
+				goto top;
 			}
 		}
 		list_iterator_destroy(job_iterator);
@@ -1587,6 +1625,8 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 				if (bit_overlap(bitmap,
 						tmp_job_ptr->node_bitmap) == 0)
 					continue;
+				if (tmp_job_ptr->details->usable_nodes == 0)
+					continue;
 				list_append(*preemptee_job_list,
 					    tmp_job_ptr);
 				remove_some_jobs = true;
@@ -1602,6 +1642,7 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		_destroy_node_data(future_usage, NULL);
 	}
 	FREE_NULL_BITMAP(orig_map);
+	FREE_NULL_BITMAP(save_bitmap);
 
 	return rc;
 }
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 21272745bbb..03104854399 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -153,6 +153,8 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    int max_share, uint32_t req_nodes,
 		    List preemptee_candidates,
 		    List *preemptee_job_list);
+static int _sort_usable_nodes_dec(struct job_record *job_a,
+				  struct job_record *job_b);
 static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id);
 static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
 static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
@@ -2309,6 +2311,21 @@ static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
 	return rc;
 }
 
+/*
+ * list_sort() comparator: order jobs by decreasing details->usable_nodes.
+ * Returns -1 if job_a has more usable nodes than job_b, 1 if fewer, else 0.
+ */
+static int _sort_usable_nodes_dec(struct job_record *job_a,
+				  struct job_record *job_b)
+{
+	if (job_a->details->usable_nodes > job_b->details->usable_nodes)
+		return -1;
+	else if (job_a->details->usable_nodes < job_b->details->usable_nodes)
+		return 1;
+
+	return 0;
+}
+
 /* Allocate resources for a job now, if possible */
 static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    uint32_t min_nodes, uint32_t max_nodes,
@@ -2322,6 +2339,7 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 	struct job_record *tmp_job_ptr;
 	ListIterator job_iterator, preemptee_iterator;
 	struct cr_record *exp_cr;
+	uint16_t pass_count = 0;
 
 	orig_map = bit_copy(bitmap);
 	if (!orig_map)
@@ -2365,39 +2383,59 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		}
 	}
 
-	if ((rc != SLURM_SUCCESS) && preemptee_candidates &&
+top:	if ((rc != SLURM_SUCCESS) && preemptee_candidates &&
 	    (exp_cr = _dup_cr(cr_ptr))) {
 		/* Remove all preemptable jobs from simulated environment */
-		job_iterator = list_iterator_create(job_list);
+		job_iterator = list_iterator_create(preemptee_candidates);
+		if (job_iterator == NULL)
+			fatal ("memory allocation failure in linear");
 		while ((tmp_job_ptr = (struct job_record *)
 			list_next(job_iterator))) {
+			bool remove_all = false;
+			uint16_t mode;
+
 			if (!IS_JOB_RUNNING(tmp_job_ptr) &&
 			    !IS_JOB_SUSPENDED(tmp_job_ptr))
 				continue;
-			if (_is_preemptable(tmp_job_ptr,
-					    preemptee_candidates)) {
-				bool remove_all = false;
-				uint16_t mode;
-				mode = slurm_job_preempt_mode(tmp_job_ptr);
-				if ((mode == PREEMPT_MODE_REQUEUE)    ||
-				    (mode == PREEMPT_MODE_CHECKPOINT) ||
-				    (mode == PREEMPT_MODE_CANCEL))
-					remove_all = true;
-				/* Remove preemptable job now */
-				_rm_job_from_nodes(exp_cr, tmp_job_ptr,
-						   "_run_now",
-						   remove_all);
-				j = _job_count_bitmap(exp_cr, job_ptr,
-						      orig_map, bitmap,
-						      (max_share - 1),
-						      NO_SHARE_LIMIT,
-						      SELECT_MODE_RUN_NOW);
-				if (j < min_nodes)
-					continue;
-				rc = _job_test(job_ptr, bitmap, min_nodes,
-					       max_nodes, req_nodes);
-				if (rc == SLURM_SUCCESS)
+			mode = slurm_job_preempt_mode(tmp_job_ptr);
+			if ((mode == PREEMPT_MODE_REQUEUE)    ||
+			    (mode == PREEMPT_MODE_CHECKPOINT) ||
+			    (mode == PREEMPT_MODE_CANCEL))
+				remove_all = true;
+			/* Remove preemptable job now */
+			_rm_job_from_nodes(exp_cr, tmp_job_ptr,
+					   "_run_now",
+					   remove_all);
+			j = _job_count_bitmap(exp_cr, job_ptr,
+					      orig_map, bitmap,
+					      (max_share - 1),
+					      NO_SHARE_LIMIT,
+					      SELECT_MODE_RUN_NOW);
+			tmp_job_ptr->details->usable_nodes =
+				bit_overlap(bitmap, tmp_job_ptr->node_bitmap);
+			if (j < min_nodes)
+				continue;
+			rc = _job_test(job_ptr, bitmap, min_nodes,
+				       max_nodes, req_nodes);
+			/*
+			 * If successful, set the last job's usable count to a
+			 * large value so that it will be first after sorting.
+			 * Note: usable_nodes is only used for sorting purposes
+			 */
+			if (rc == SLURM_SUCCESS) {
+				if (pass_count++ ||
+				    (list_count(preemptee_candidates) == 1))
 					break;
+				tmp_job_ptr->details->usable_nodes = 9999;
+				while ((tmp_job_ptr = (struct job_record *)
+					list_next(job_iterator))) {
+					tmp_job_ptr->details->usable_nodes = 0;
+				}
+				list_sort(preemptee_candidates,
+					  (ListCmpF)_sort_usable_nodes_dec);
+				rc = EINVAL;
+				list_iterator_destroy(job_iterator);
+				goto top;
 			}
 		}
 		list_iterator_destroy(job_iterator);
@@ -2418,6 +2456,8 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 				if (bit_overlap(bitmap,
 						tmp_job_ptr->node_bitmap) == 0)
 					continue;
+				if (tmp_job_ptr->details->usable_nodes == 0)
+					continue;
 				list_append(*preemptee_job_list,
 					    tmp_job_ptr);
 			}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index fb964bd8193..a0e633bfce0 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -415,6 +415,7 @@ struct job_details {
 	uint16_t task_dist;		/* task layout for this job. Only
 					 * useful when Consumable Resources
                                          * is enabled */
+	uint32_t usable_nodes;		/* node count needed by preemption */
 	char *work_dir;			/* pathname of working directory */
 };
 
-- 
GitLab