From f408dd85cd785e5a66a0e324afd0ed3de51b46ee Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 16 Jan 2008 18:30:01 +0000
Subject: [PATCH] Add will_run logic to select/cons_res, fix job end-time
 sorting logic in   select/linear

---
 NEWS                                          |   5 +-
 src/plugins/select/cons_res/dist_tasks.c      |   4 +-
 src/plugins/select/cons_res/select_cons_res.c | 312 ++++++++++++++----
 src/plugins/select/linear/select_linear.c     |   8 +-
 4 files changed, 249 insertions(+), 80 deletions(-)

diff --git a/NEWS b/NEWS
index f8a290dcdc5..52aca4f748d 100644
--- a/NEWS
+++ b/NEWS
@@ -18,8 +18,9 @@ documents those changes that are of interest to users and admins.
     debug level at any time (Hongjia Cao, NUDT).
  -- Track total total suspend time for jobs and steps for accounting purposes.
  -- Add version information to partition state file.
- -- Added 'will-run' functionality to the bluegene plugin to return node 
-    list and time job can start based off other jobs running.
+ -- Added 'will-run' functionality to all of the select plugins (bluegene,
+    linear, and cons_res) to return the node list and the time a job can
+    start based on other running jobs.
  -- Major restructuring of node selection logic. select/linear now supports
     partition max_share parameter and tries to match like size jobs on the 
     same nodes to improve gang scheduling performance. Also supports treating 
diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c
index 69a3f4f77d4..7ae7079e8ea 100644
--- a/src/plugins/select/cons_res/dist_tasks.c
+++ b/src/plugins/select/cons_res/dist_tasks.c
@@ -409,7 +409,7 @@ extern int cr_dist(struct select_cr_job *job, int cyclic,
 		this_cr_node = &select_node_ptr[host_index];
 		
 		if (job->cpus[job_index] == 0) {
-			error(" cons_res: %d no available cpus on node %s ",
+			error("cons_res: %d no available cpus on node %s ",
 			      job->job_id,
 			      node_record_table_ptr[host_index].name);
 			continue;
@@ -543,7 +543,7 @@ extern int cr_plane_dist(struct select_cr_job *job,
 		this_cr_node = &select_node_ptr[host_index];
 		
 		if (job->cpus[job_index] == 0) {
-			error(" cons_res: no available cpus on node %s", 
+			error("cons_res: no available cpus on node %s", 
 			      node_record_table_ptr[host_index].name);
 			continue;
 		}
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 47aa885b456..c23839497ec 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -32,27 +32,27 @@
  * 
  *  [<snip>]# squeue
  *  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
- *     5       lsf    sleep     root  PD       0:00      1 (Resources)
- *     2       lsf    sleep     root   R       0:13      4 linux[01-04]
- *     3       lsf    sleep     root   R       0:09      3 linux[01-03]
- *     4       lsf    sleep     root   R       0:05      1 linux04
+ *     5        lsf    sleep     root  PD       0:00      1 (Resources)
+ *     2        lsf    sleep     root   R       0:13      4 linux[01-04]
+ *     3        lsf    sleep     root   R       0:09      3 linux[01-03]
+ *     4        lsf    sleep     root   R       0:05      1 linux04
  *  [<snip>]#
  * 
  *  Once Job 2 finishes, Job 5, which was pending, is allocated
  *  available resources and is then running as illustrated below:
  * 
  *  [<snip>]# squeue4
- *   JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
- *     3       lsf    sleep     root   R       1:58      3 linux[01-03]
- *     4       lsf    sleep     root   R       1:54      1 linux04
- *     5       lsf    sleep     root   R       0:02      3 linux[01-03]
+ *   JOBID PARTITION    NAME     USER  ST       TIME  NODES NODELIST(REASON)
+ *     3        lsf    sleep     root   R       1:58      3 linux[01-03]
+ *     4        lsf    sleep     root   R       1:54      1 linux04
+ *     5        lsf    sleep     root   R       0:02      3 linux[01-03]
  *  [<snip>]#
  * 
  *  Job 3, Job 4, and Job 5 are now running concurrently on the cluster.
  * 
  *  [<snip>]#  squeue4
  *  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
- *     5       lsf    sleep     root   R       1:52      3 xc14n[13-15]
+ *     5        lsf    sleep     root   R       1:52      3 xc14n[13-15]
  *  [<snip>]#
  *
  * The advantage of the consumable resource scheduling policy is that
@@ -160,37 +160,42 @@ static uint32_t last_verified_job_id = 0;
 /* verify the job list after every CR_VERIFY_JOB_CYCLE jobs have finished */
 #define CR_VERIFY_JOB_CYCLE 2000
 
+static void	_cr_job_list_del(void *x);
+static int	_cr_job_list_sort(void *x, void *y);
+static struct node_cr_record *_dup_node_cr(struct node_cr_record *node_cr_ptr);
+static int	_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
+			uint32_t min_nodes, uint32_t max_nodes, 
+			uint32_t req_nodes, int mode, 
+			enum node_cr_state job_node_req,
+			struct node_cr_record *select_node_ptr);
+static int 	_will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
+			uint32_t min_nodes, uint32_t max_nodes, 
+			uint32_t req_nodes, enum node_cr_state job_node_req);
+
 #ifdef CR_DEBUG
-static void _dump_state(void)
+static void _dump_state(struct node_cr_record *select_node_ptr)
 {
-	int i;
+	int i, j, cores;
 	struct part_cr_record *parts;
 	ListIterator job_iterator;
 	struct select_cr_job *job;
 
 	for (i=0; i<select_node_cnt; i++) {
-		info("node:%s sockets:%u memory:%u state:%d",
+		info("node:%s sockets:%u alloc_memory:%u state:%d",
 			select_node_ptr[i].node_ptr->name,
 			select_node_ptr[i].num_sockets,
 			select_node_ptr[i].alloc_memory,
 			select_node_ptr[i].node_state);
 		parts = select_node_ptr[i].parts;
 		while (parts) {
-			if (parts->num_rows == 0) {
-				info("  part:%s rows:%u",
-					parts->part_name,
-					parts->num_rows);
-			} else if (parts->num_rows == 1) {
-				info("  part:%s rows:%u cores:%u",
-					parts->part_name,
-					parts->num_rows,
-					parts->alloc_cores[0]);
-			} else {
-				info("  part:%s rows:%u cores:%u,%u",
-					parts->part_name,
-					parts->num_rows,
-					parts->alloc_cores[0],
-					parts->alloc_cores[1]);
+			info("  part:%s rows:%u",
+				parts->part_name,
+				parts->num_rows);
+			cores = select_node_ptr[i].num_sockets * 
+				parts->num_rows;
+			for (j=0; j<cores; j++) {
+				info("    alloc_cores[%d]:%u",
+					j, parts->alloc_cores[j]);
 			}
 			parts = parts->next;
 		}
@@ -218,6 +223,43 @@ static void _dump_state(void)
 }
 #endif
 
+/* Create a duplicate node_cr_records structure */
+static struct node_cr_record *_dup_node_cr(struct node_cr_record *node_cr_ptr)
+{
+	int i, j;
+	struct node_cr_record *new_node_cr_ptr;
+	struct part_cr_record *part_cr_ptr, *new_part_cr_ptr;
+
+	if (node_cr_ptr == NULL)
+		return NULL;
+
+	new_node_cr_ptr = xmalloc(select_node_cnt *
+				  sizeof(struct node_cr_record));
+
+	for (i=0; i<select_node_cnt; i++) {
+		new_node_cr_ptr[i].node_ptr     = select_node_ptr[i].node_ptr;
+		new_node_cr_ptr[i].num_sockets  = select_node_ptr[i].num_sockets;
+		new_node_cr_ptr[i].alloc_memory = select_node_ptr[i].alloc_memory;
+		new_node_cr_ptr[i].node_state   = select_node_ptr[i].node_state;
+
+		part_cr_ptr = select_node_ptr[i].parts;
+		while (part_cr_ptr) {
+			new_part_cr_ptr = xmalloc(sizeof(struct part_cr_record));
+			new_part_cr_ptr->part_name  = xstrdup(part_cr_ptr->part_name);
+			new_part_cr_ptr->num_rows   = part_cr_ptr->num_rows;
+			j = sizeof(uint16_t) * part_cr_ptr->num_rows * 
+			    select_node_ptr[i].num_sockets;
+			new_part_cr_ptr->alloc_cores = xmalloc(j);
+			memcpy(new_part_cr_ptr->alloc_cores, 
+			       part_cr_ptr->alloc_cores, j);
+			new_part_cr_ptr->next        = new_node_cr_ptr[i].parts;
+			new_node_cr_ptr[i].parts     = new_part_cr_ptr;
+			part_cr_ptr = part_cr_ptr->next;
+		}
+	}
+	return new_node_cr_ptr;
+}
+
 static void _destroy_node_part_array(struct node_cr_record *this_cr_node)
 {
 	struct part_cr_record *p_ptr;
@@ -229,7 +271,17 @@ static void _destroy_node_part_array(struct node_cr_record *this_cr_node)
 		xfree(p_ptr->alloc_cores);
 	}
 	xfree(this_cr_node->parts);
-	this_cr_node->parts = NULL;
+}
+
+static void _cr_job_list_del(void *x)
+{
+	xfree(x);
+}
+static int  _cr_job_list_sort(void *x, void *y)
+{
+	struct job_record **job1_pptr = (struct job_record **) x;
+	struct job_record **job2_pptr = (struct job_record **) y;
+	return (int) difftime(job1_pptr[0]->end_time, job2_pptr[0]->end_time);
 }
 
 static void _create_node_part_array(struct node_cr_record *this_cr_node)
@@ -242,10 +294,8 @@ static void _create_node_part_array(struct node_cr_record *this_cr_node)
 		return;
 	node_ptr = this_cr_node->node_ptr;
 
-	if (this_cr_node->parts) {
+	if (this_cr_node->parts)
 		_destroy_node_part_array(this_cr_node);
-		this_cr_node->parts = NULL;
-	}
 
 	if (node_ptr->part_cnt < 1)
 		return;
@@ -425,7 +475,8 @@ static uint16_t _get_cpu_data (struct part_cr_record *p_ptr, int num_sockets,
  * IN job_ptr - pointer to job being scheduled
  * IN index - index of node's configuration information in select_node_ptr
  */
-static uint16_t _get_task_count(struct job_record *job_ptr, const int index, 
+static uint16_t _get_task_count(struct node_cr_record *select_node_ptr,
+				struct job_record *job_ptr, const int index, 
 				const bool all_available, bool try_partial_idle,
 				enum node_cr_state job_node_req)
 {
@@ -441,9 +492,7 @@ static uint16_t _get_task_count(struct job_record *job_ptr, const int index,
 	cpus_per_task   = job_ptr->details->cpus_per_task;
 	ntasks_per_node = job_ptr->details->ntasks_per_node;
 
-	if (!job_ptr->details->mc_ptr)
-		job_ptr->details->mc_ptr = create_default_mc();
-	mc_ptr = job_ptr->details->mc_ptr;
+	mc_ptr      = job_ptr->details->mc_ptr;
 	min_sockets = mc_ptr->min_sockets;
 	max_sockets = mc_ptr->max_sockets;
 	min_cores   = mc_ptr->min_cores;
@@ -464,7 +513,8 @@ static uint16_t _get_task_count(struct job_record *job_ptr, const int index,
 	if (!all_available) {
 		p_ptr = get_cr_part_ptr(this_node, job_ptr->partition);
 		if (!p_ptr) {
-			error("cons_res: _get_task_count: could not find part %s",			      job_ptr->part_ptr->name);
+			error("cons_res: _get_task_count: could not find part %s",
+			      job_ptr->part_ptr->name);
 		} else {
 			if (job_node_req == NODE_CR_ONE_ROW) {
 				/* need to scan over all partitions with
@@ -663,7 +713,7 @@ static void _append_to_job_list(struct select_cr_job *new_job)
 	list_iterator_destroy(iterator);
 	list_append(select_cr_job_list, new_job);
 	slurm_mutex_unlock(&cr_mutex);
-	debug3 (" cons_res: _append_to_job_list job_id %u to list. "
+	debug3 ("cons_res: _append_to_job_list job_id %u to list. "
 		"list_count %d ", job_id, list_count(select_cr_job_list));
 }
 
@@ -896,7 +946,8 @@ static int _add_job_to_nodes(struct select_cr_job *job, char *pre_err,
  * if remove_all = 1: deallocate all resources
  * if remove_all = 0: the job has been suspended, so just deallocate CPUs
  */
-static int _rm_job_from_nodes(struct select_cr_job *job, char *pre_err,
+static int _rm_job_from_nodes(struct node_cr_record *select_node_ptr,
+			      struct select_cr_job *job, char *pre_err,
 			      int remove_all)
 {
 	int host_index, i, j, k, rc = SLURM_SUCCESS;
@@ -1900,7 +1951,8 @@ static int _is_node_busy(struct node_cr_record *this_node)
  *  - job_node_req = NODE_CR_RESERVED, then we need idle nodes
  *  - job_node_req = NODE_CR_ONE_ROW, then we need idle or non-sharing nodes
  */
-static int _verify_node_state(struct job_record *job_ptr, bitstr_t * bitmap,
+static int _verify_node_state(struct node_cr_record *select_node_ptr,
+			      struct job_record *job_ptr, bitstr_t * bitmap,
 			      enum node_cr_state job_node_req)
 {
 	int i, free_mem;
@@ -1980,7 +2032,8 @@ static enum node_cr_state _get_job_node_req(struct job_record *job_ptr)
 	return NODE_CR_ONE_ROW;
 }
 
-static int _get_allocated_rows(struct job_record *job_ptr, int n,
+static int _get_allocated_rows(struct node_cr_record *select_node_ptr,
+			       struct job_record *job_ptr, int n,
 			       enum node_cr_state job_node_req)
 {
 	struct part_cr_record *p_ptr;
@@ -2002,7 +2055,8 @@ static int _get_allocated_rows(struct job_record *job_ptr, int n,
 	return rows;
 }
 
-static int _load_arrays(struct job_record *job_ptr, bitstr_t *bitmap, 
+static int _load_arrays(struct node_cr_record *select_node_ptr,
+			struct job_record *job_ptr, bitstr_t *bitmap, 
 			int **a_rows, int **s_tasks, int **a_tasks, 
 			int **freq, bool test_only,
 			enum node_cr_state job_node_req)
@@ -2020,15 +2074,18 @@ static int _load_arrays(struct job_record *job_ptr, bitstr_t *bitmap,
 		if (bit_test(bitmap, i)) {
 			int rows;
 			uint16_t atasks, ptasks;
-			rows = _get_allocated_rows(job_ptr, i, job_node_req);
+			rows = _get_allocated_rows(select_node_ptr, job_ptr, 
+						   i, job_node_req);
 			/* false = use free rows (if available) */
-			atasks = _get_task_count(job_ptr, i, test_only, false,
+			atasks = _get_task_count(select_node_ptr, job_ptr, i, 
+						 test_only, false,
 						 job_node_req);
 			if (test_only) {
 				ptasks = atasks;
 			} else {
 				/* true = try using an already allocated row */
-				ptasks = _get_task_count(job_ptr, i, test_only,
+				ptasks = _get_task_count(select_node_ptr, 
+							 job_ptr, i, test_only,
 							 true, job_node_req);
 			}
 			if (rows   != busy_rows[index] ||
@@ -2122,42 +2179,142 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 			     uint32_t min_nodes, uint32_t max_nodes, 
 			     uint32_t req_nodes, int mode)
 {
-	int a, f, i, j, k, error_code, ll; /* ll = layout array index */
-	struct multi_core_data *mc_ptr = NULL;
-	static struct select_cr_job *job;
-	uint16_t * layout_ptr = NULL;
 	enum node_cr_state job_node_req;
-	int  array_size;
-	int *busy_rows, *sh_tasks, *al_tasks, *freq;
-	bitstr_t *origmap, *reqmap = NULL;
-	int row, rows, try;
-	bool test_only;
 
 	xassert(bitmap);
 
-	if (mode == SELECT_MODE_TEST_ONLY)
-		test_only = true;
-	else if (mode == SELECT_MODE_RUN_NOW)
-		test_only = false;
-	else	/* SELECT_MODE_WILL_RUN */
-		return EINVAL;	/* not yet supported */
-
 	if (!job_ptr->details)
 		return EINVAL;
 
-	layout_ptr = job_ptr->details->req_node_layout;
 	if (!job_ptr->details->mc_ptr)
 		job_ptr->details->mc_ptr = create_default_mc();
-	mc_ptr = job_ptr->details->mc_ptr;
-	reqmap = job_ptr->details->req_node_bitmap;
 	job_node_req = _get_job_node_req(job_ptr);
 
-	debug3("cons_res: select_p_job_test: job %d node_req %d, test_only %d",
-	       job_ptr->job_id, job_node_req, test_only);
+	debug3("cons_res: select_p_job_test: job %d node_req %d, mode %d",
+	       job_ptr->job_id, job_node_req, mode);
 	debug3("cons_res: select_p_job_test: min_n %u max_n %u req_n %u",
 	       min_nodes, max_nodes, req_nodes);
 	
+	if (mode == SELECT_MODE_WILL_RUN) {
+		return _will_run_test(job_ptr, bitmap, min_nodes, max_nodes,
+				      req_nodes, job_node_req);
+	}
+
+	return _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, 
+			 mode, job_node_req, select_node_ptr);
+}
+
+/* _will_run_test - determine when and where a pending job can start, removes 
+ *	jobs from node table at termination time and run _test_job() after 
+ *	each one. */
+static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
+			uint32_t min_nodes, uint32_t max_nodes, 
+			uint32_t req_nodes, enum node_cr_state job_node_req)
+{
+	struct node_cr_record *exp_node_cr;
+	struct job_record *tmp_job_ptr, **tmp_job_pptr;
+	struct select_cr_job *job;
+	List cr_job_list;
+	ListIterator job_iterator;
+	bitstr_t *orig_map;
+	int rc = SLURM_ERROR;
+	uint16_t saved_state;
+
+	orig_map = bit_copy(bitmap);
+
+	/* Try to run with currently available nodes */
+	rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, 
+		       SELECT_MODE_RUN_NOW, job_node_req, select_node_ptr);
+	if (rc == SLURM_SUCCESS) {
+		bit_free(orig_map);
+		job_ptr->start_time = time(NULL);
+		return SLURM_SUCCESS;
+	}
+
+	/* Job is still pending. Simulate termination of jobs one at a time 
+	 * to determine when and where the job can start. */
+	exp_node_cr = _dup_node_cr(select_node_ptr);
+	if (exp_node_cr == NULL) {
+		bit_free(orig_map);
+		return SLURM_ERROR;
+	}
+
+	/* Build list of running jobs */
+	cr_job_list = list_create(_cr_job_list_del);
+	job_iterator = list_iterator_create(job_list);
+	while ((tmp_job_ptr = (struct job_record *) list_next(job_iterator))) {
+		if (tmp_job_ptr->job_state != JOB_RUNNING)
+			continue;
+		if (tmp_job_ptr->end_time == 0) {
+			error("Job %u has zero end_time", tmp_job_ptr->job_id);
+			continue;
+		}
+		tmp_job_pptr = xmalloc(sizeof(struct job_record *));
+		*tmp_job_pptr = tmp_job_ptr;
+		list_append(cr_job_list, tmp_job_pptr);
+	}
+	list_iterator_destroy(job_iterator);
+	list_sort(cr_job_list, _cr_job_list_sort);
+
+	/* Remove the running jobs one at a time from exp_node_cr and try
+	 * scheduling the pending job after each one */
+	job_iterator = list_iterator_create(cr_job_list);
+	while ((tmp_job_pptr = (struct job_record **) list_next(job_iterator))) {
+		tmp_job_ptr = *tmp_job_pptr;
+		job = list_find_first(select_cr_job_list, _find_job_by_id,
+				      &tmp_job_ptr->job_id);
+		if (!job) {
+			error("cons_res: could not find job %u", 
+			      tmp_job_ptr->job_id);
+			continue;
+		}
+		saved_state = job->state;
+		_rm_job_from_nodes(exp_node_cr, job, "_will_run_test", 1);
+		job->state = saved_state;
+		rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, 
+			       req_nodes, SELECT_MODE_WILL_RUN, job_node_req,
+			       exp_node_cr);
+		if (rc == SLURM_SUCCESS) {
+			job_ptr->start_time = tmp_job_ptr->end_time;
+			break;
+		}
+		bit_or(bitmap, orig_map);
+	}
+	list_iterator_destroy(job_iterator);
+	list_destroy(cr_job_list);
+	_destroy_node_part_array(exp_node_cr);
+	bit_free(orig_map);
+	return rc;
+}
+
+/* _job_test - does most of the real work for select_p_job_test(), which 
+ *	pretty much just handles load-leveling and max_share logic */
+static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
+			uint32_t min_nodes, uint32_t max_nodes, 
+			uint32_t req_nodes, int mode, 
+			enum node_cr_state job_node_req,
+			struct node_cr_record *select_node_ptr)
+{
+	int a, f, i, j, k, error_code, ll; /* ll = layout array index */
+	struct multi_core_data *mc_ptr = NULL;
+	static struct select_cr_job *job;
+	uint16_t * layout_ptr = NULL;
+	int  array_size;
+	int *busy_rows, *sh_tasks, *al_tasks, *freq;
+	bitstr_t *origmap, *reqmap = NULL;
+	int row, rows, try;
+	bool test_only;
+
+	layout_ptr = job_ptr->details->req_node_layout;
+	mc_ptr = job_ptr->details->mc_ptr;
+	reqmap = job_ptr->details->req_node_bitmap;
+
 	/* check node_state and update bitmap as necessary */
+	if (mode == SELECT_MODE_TEST_ONLY)
+		test_only = true;
+	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN  */ 
+		test_only = false;
+
 	if (!test_only) {
 #if 0
 		/* Done in slurmctld/node_scheduler.c: _pick_best_nodes() */
@@ -2165,13 +2322,13 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 		    (cr_type != CR_MEMORY) && (cr_type != CR_SOCKET_MEMORY))
 			job_ptr->details->job_max_memory = 0;
 #endif
-		error_code = _verify_node_state(job_ptr, bitmap, job_node_req);
+		error_code = _verify_node_state(select_node_ptr, job_ptr, 
+						bitmap, job_node_req);
 		if (error_code != SLURM_SUCCESS)
 			return error_code;
 	}
 
 	/* This is the case if -O/--overcommit  is true */ 
-	debug3("job_ptr->num_procs %u", job_ptr->num_procs);
 	if (job_ptr->num_procs == job_ptr->details->min_nodes) {
 		job_ptr->num_procs *= MAX(1, mc_ptr->min_threads);
 		job_ptr->num_procs *= MAX(1, mc_ptr->min_cores);
@@ -2179,8 +2336,9 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 	}
 
 	/* compute condensed arrays of node allocation data */
-	array_size = _load_arrays(job_ptr, bitmap, &busy_rows, &sh_tasks,
-				   &al_tasks, &freq, test_only, job_node_req);
+	array_size = _load_arrays(select_node_ptr, job_ptr, bitmap, &busy_rows,
+				  &sh_tasks, &al_tasks, &freq, test_only, 
+				  job_node_req);
 
 	if (test_only) {
         	/* try with all nodes and all possible cpus */
@@ -2257,6 +2415,15 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 					al_tasks[i], freq[i]);
 			}
 
+			if (row > 1) {
+				/* We need to share resources. 
+				 * Try to find suitable job to share nodes with. */
+
+				/* FIXME: To be added. There is some simple logic 
+				 * to do this in select/linear.c:_find_job_mate(), 
+				 * but the data structures here are very different */
+			}
+
 			error_code = _select_nodes(job_ptr, bitmap, min_nodes,
 						   max_nodes, req_nodes,
 						   sh_tasks, freq, array_size);
@@ -2268,7 +2435,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 	}
 
 	bit_free(origmap);
-	if (error_code != SLURM_SUCCESS) {
+	if ((error_code != SLURM_SUCCESS) || (mode == SELECT_MODE_WILL_RUN)) {
 		xfree(busy_rows);
 		xfree(sh_tasks);
 		xfree(al_tasks);
@@ -2437,7 +2604,7 @@ extern int select_p_job_fini(struct job_record *job_ptr)
 		return SLURM_ERROR;
 	}
 	
-	_rm_job_from_nodes(job, "select_p_job_fini", 1);
+	_rm_job_from_nodes(select_node_ptr, job, "select_p_job_fini", 1);
 
 	slurm_mutex_lock(&cr_mutex);
 	list_remove(iterator);
@@ -2467,7 +2634,8 @@ extern int select_p_job_suspend(struct job_record *job_ptr)
 	if (!job)
 		return ESLURM_INVALID_JOB_ID;
 
-	rc = _rm_job_from_nodes(job, "select_p_job_suspend", 0);
+	rc = _rm_job_from_nodes(select_node_ptr, job, 
+				"select_p_job_suspend", 0);
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 07c29e3cf2a..d76ec44a975 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -1329,7 +1329,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	}
 	list_iterator_destroy(job_iterator);
 	list_destroy(cr_job_list);
-
+	_free_node_cr(exp_node_cr);
 	bit_free(orig_map);
 	return rc;
 }
@@ -1340,7 +1340,7 @@ static void _cr_job_list_del(void *x)
 }
 static int  _cr_job_list_sort(void *x, void *y)
 {
-	struct job_record *job1_ptr = (struct job_record *) x;
-	struct job_record *job2_ptr = (struct job_record *) y;
-	return (int) job1_ptr->end_time - job2_ptr->end_time;
+	struct job_record **job1_pptr = (struct job_record **) x;
+	struct job_record **job2_pptr = (struct job_record **) y;
+	return (int) difftime(job1_pptr[0]->end_time, job2_pptr[0]->end_time);
 }
-- 
GitLab