From 075ecb52e99dbc38eddc71c9903c395ab792e1b7 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 6 Apr 2010 23:04:38 +0000
Subject: [PATCH] get select/linear to allocate resources released via job
 shrinking. Fix file permission and error logging problems with scontrol
 job shrinking

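Add a select_p_job_resized() operation to the select plugin API so that
slurmctld can notify the plugin when a node is removed from a running
job's allocation. select/linear uses this to mark the released node's
resources as available to other jobs; select/bluegene and
select/cons_res only stub the new operation for now.

A sketch of the intended use, with a hypothetical job ID and node count
(scontrol also writes slurm_job_<id>_resize.sh/.csh scripts that can be
sourced to update the SLURM_* environment variables):

    $ scontrol update JobId=1234 NumNodes=2
    $ . ./slurm_job_1234_resize.sh
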
---
 src/common/node_select.c                      |  17 ++
 src/common/node_select.h                      |   8 +
 .../select/bluegene/plugin/select_bluegene.c  |   6 +
 src/plugins/select/cons_res/select_cons_res.c |   7 +
 src/plugins/select/linear/select_linear.c     | 191 ++++++++++++++++--
 src/scontrol/update_job.c                     |   8 +-
 src/slurmctld/job_mgr.c                       |   5 +
 7 files changed, 228 insertions(+), 14 deletions(-)

diff --git a/src/common/node_select.c b/src/common/node_select.c
index 460349da21e..b3a5209c775 100644
--- a/src/common/node_select.c
+++ b/src/common/node_select.c
@@ -87,6 +87,8 @@ typedef struct slurm_select_ops {
 						List *preemptee_job_list);
 	int		(*job_begin)	       (struct job_record *job_ptr);
 	int		(*job_ready)	       (struct job_record *job_ptr);
+	int		(*job_resized)	       (struct job_record *job_ptr,
+						struct node_record *node_ptr);
 	int		(*job_fini)	       (struct job_record *job_ptr);
 	int		(*job_suspend)	       (struct job_record *job_ptr);
 	int		(*job_resume)	       (struct job_record *job_ptr);
@@ -189,6 +191,7 @@ static slurm_select_ops_t * _select_get_ops(slurm_select_context_t *c)
 		"select_p_job_test",
 		"select_p_job_begin",
 		"select_p_job_ready",
+		"select_p_job_resized",
 		"select_p_job_fini",
 		"select_p_job_suspend",
 		"select_p_job_resume",
@@ -701,6 +704,20 @@ extern int select_g_job_ready(struct job_record *job_ptr)
 	return (*(g_select_context->ops.job_ready))(job_ptr);
 }
 
+/*
+ * Modify internal data structures for a job that has changed size.
+ *	Only job shrinking is currently supported.
+ * RET: 0 or an error code
+ */
+extern int select_g_job_resized(struct job_record *job_ptr,
+				struct node_record *node_ptr)
+{
+	if (slurm_select_init() < 0)
+		return -1;
+
+	return (*(g_select_context->ops.job_resized))(job_ptr, node_ptr);
+}
+
 /*
  * Note termination of job is starting. Executed from slurmctld.
  * IN job_ptr - pointer to job being terminated
diff --git a/src/common/node_select.h b/src/common/node_select.h
index 3e4c184245f..2a813bb62ec 100644
--- a/src/common/node_select.h
+++ b/src/common/node_select.h
@@ -248,6 +248,14 @@ extern int select_g_job_begin(struct job_record *job_ptr);
  */
 extern int select_g_job_ready(struct job_record *job_ptr);
 
+/*
+ * Modify internal data structures for a job that has changed size.
+ *	Only job shrinking is currently supported.
+ * RET: 0 or an error code
+ */
+extern int select_g_job_resized(struct job_record *job_ptr,
+				struct node_record *node_ptr);
+
 /*
  * Note termination of job is starting. Executed from slurmctld.
  * IN job_ptr - pointer to job being terminated
diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c
index 4d1274be4d6..21df7a18dee 100644
--- a/src/plugins/select/bluegene/plugin/select_bluegene.c
+++ b/src/plugins/select/bluegene/plugin/select_bluegene.c
@@ -521,6 +521,12 @@ extern int select_p_job_ready(struct job_record *job_ptr)
 	return block_ready(job_ptr);
 }
 
+extern int select_p_job_resized(struct job_record *job_ptr,
+				struct node_record *node_ptr)
+{
+	return ESLURM_NOT_SUPPORTED;
+}
+
 extern int select_p_job_fini(struct job_record *job_ptr)
 {
 	return term_job(job_ptr);
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 9b44b96e3af..1ec8751beff 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -1701,6 +1701,13 @@ extern int select_p_job_ready(struct job_record *job_ptr)
 	return SLURM_SUCCESS;
 }
 
+extern int select_p_job_resized(struct job_record *job_ptr,
+				struct node_record *node_ptr)
+{
+	/* TBD */
+	return SLURM_SUCCESS;
+}
+
 extern int select_p_job_fini(struct job_record *job_ptr)
 {
 	xassert(job_ptr);
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index bfa1bf74e2b..71cf9171cb3 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -135,11 +135,15 @@ static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
 static int _rm_job_from_nodes(struct cr_record *cr_ptr,
 			      struct job_record *job_ptr, char *pre_err,
 			      bool remove_all);
+static int _rm_job_from_one_node(struct job_record *job_ptr,
+				 struct node_record *node_ptr, char *pre_err);
 static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    uint32_t min_nodes, uint32_t max_nodes,
 		    int max_share, uint32_t req_nodes,
 		    List preemptee_candidates,
 		    List *preemptee_job_list);
+static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id);
+static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
 static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
 			  uint32_t min_nodes, uint32_t max_nodes,
 			  uint32_t req_nodes);
@@ -347,9 +351,8 @@ static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
 	cr_ptr->tot_job_ids[i] = job_id;
 }
 
-/* Remove job id from record of jobs running,
- * RET true if successful, false if the job was not running */
-static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id)
+static bool _ck_run_job(struct cr_record *cr_ptr, uint32_t job_id,
+			bool clear_it)
 {
 	int i;
 
@@ -359,15 +362,29 @@ static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id)
 	for (i=0; i<cr_ptr->run_job_len; i++) {
 		if (cr_ptr->run_job_ids[i] != job_id)
 			continue;
-		cr_ptr->run_job_ids[i] = 0;
+		if (clear_it)
+			cr_ptr->run_job_ids[i] = 0;
 		return true;
 	}
 	return false;
 }
 
-/* Remove job id from record of jobs running or suspended,
+/* Remove job id from record of jobs running,
  * RET true if successful, false if the job was not running */
-static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
+static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id)
+{
+	return _ck_run_job(cr_ptr, job_id, true);
+}
+
+/* Test for job id in record of jobs running,
+ * RET true if the job is running, false otherwise */
+static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id)
+{
+	return _ck_run_job(cr_ptr, job_id, false);
+}
+
+static bool _ck_tot_job(struct cr_record *cr_ptr, uint32_t job_id,
+			bool clear_it)
 {
 	int i;
 
@@ -377,11 +394,25 @@ static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
 	for (i=0; i<cr_ptr->tot_job_len; i++) {
 		if (cr_ptr->tot_job_ids[i] != job_id)
 			continue;
-		cr_ptr->tot_job_ids[i] = 0;
+		if (clear_it)
+			cr_ptr->tot_job_ids[i] = 0;
 		return true;
 	}
 	return false;
 }
+/* Remove job id from record of jobs running or suspended,
+ * RET true if successful, false if the job was not found */
+static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
+{
+	return _ck_tot_job(cr_ptr, job_id, true);
+}
+
+/* Test for job id in record of jobs running or suspended,
+ * RET true if the job was found, false otherwise */
+static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
+{
+	return _ck_tot_job(cr_ptr, job_id, false);
+}
 
 static inline bool _job_preemption_enabled(void)
 {
@@ -1313,7 +1344,8 @@ static int _rm_job_from_nodes(struct cr_record *cr_ptr,
 
 	is_job_running = _rem_run_job(cr_ptr, job_ptr->job_id);
 	for (i = i_first; i <= i_last; i++) {
-		if (bit_test(job_resrcs_ptr->node_bitmap, i) == 0)
+		if (!bit_test(job_resrcs_ptr->node_bitmap, i) ||
+		    !bit_test(job_ptr->node_bitmap, i))
 			continue;
 		if (job_memory_cpu == 0)
 			job_memory = job_memory_node;
@@ -1393,6 +1425,114 @@ static int _rm_job_from_nodes(struct cr_record *cr_ptr,
 	return rc;
 }
 
+/*
+ * Deallocate resources that were allocated to this job on one node
+ */
+static int _rm_job_from_one_node(struct job_record *job_ptr,
+				 struct node_record *node_ptr, char *pre_err)
+{
+	int i, rc = SLURM_SUCCESS;
+	struct part_cr_record *part_cr_ptr;
+	job_resources_t *job_resrcs_ptr;
+	uint32_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
+	bool exclusive, is_job_running;
+
+	if (cr_ptr == NULL) {
+		error("%s: cr_ptr not initialized", pre_err);
+		return SLURM_ERROR;
+	}
+
+	if (!_test_tot_job(cr_ptr, job_ptr->job_id)) {
+		info("select/linear: job %u has no resources allocated",
+		     job_ptr->job_id);
+		return SLURM_ERROR;
+	}
+
+	i = node_ptr - node_record_table_ptr;
+	exclusive = job_ptr->details && (job_ptr->details->shared == 0);
+	if (job_ptr->details &&
+	    job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
+		if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
+			job_memory_cpu = job_ptr->details->pn_min_memory &
+					 (~MEM_PER_CPU);
+		} else
+			job_memory_node = job_ptr->details->pn_min_memory;
+	}
+	if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
+		error("job %u lacks a job_resources struct", job_ptr->job_id);
+		return SLURM_ERROR;
+	}
+	if (!bit_test(job_resrcs_ptr->node_bitmap, i)) {
+		error("job %u allocated nodes (%s) which have been removed "
+		      "from slurm.conf",
+		      job_ptr->job_id, node_ptr->name);
+		return SLURM_ERROR;
+	}
+
+	is_job_running = _test_run_job(cr_ptr, job_ptr->job_id);
+	if (job_memory_cpu == 0)
+		job_memory = job_memory_node;
+	else if (select_fast_schedule)
+		job_memory = job_memory_cpu * node_ptr->config_ptr->cpus;
+	else
+		job_memory = job_memory_cpu * node_ptr->cpus;
+	if (cr_ptr->nodes[i].alloc_memory >= job_memory)
+		cr_ptr->nodes[i].alloc_memory -= job_memory;
+	else {
+		cr_ptr->nodes[i].alloc_memory = 0;
+		error("%s: memory underflow for node %s",
+		      pre_err, node_ptr->name);
+	}
+	if (exclusive) {
+		if (cr_ptr->nodes[i].exclusive_cnt)
+			cr_ptr->nodes[i].exclusive_cnt--;
+		else {
+			error("%s: exclusive_cnt underflow for node %s",
+			      pre_err, node_ptr->name);
+		}
+	}
+	part_cr_ptr = cr_ptr->nodes[i].parts;
+	while (part_cr_ptr) {
+		if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
+			part_cr_ptr = part_cr_ptr->next;
+			continue;
+		}
+		if (!is_job_running)
+			/* suspended job: run_job_cnt already decremented */;
+		else if (part_cr_ptr->run_job_cnt > 0)
+			part_cr_ptr->run_job_cnt--;
+		else {
+			error("%s: run_job_cnt underflow for node %s",
+			      pre_err, node_ptr->name);
+		}
+		if (part_cr_ptr->tot_job_cnt > 0)
+			part_cr_ptr->tot_job_cnt--;
+		else {
+			error("%s: tot_job_cnt underflow for node %s",
+			      pre_err, node_ptr->name);
+		}
+		if ((part_cr_ptr->tot_job_cnt == 0) &&
+		    (part_cr_ptr->run_job_cnt)) {
+			part_cr_ptr->run_job_cnt = 0;
+			error("%s: run_job_count out of sync for node %s",
+			      pre_err, node_ptr->name);
+		}
+		break;
+	}
+	if (part_cr_ptr == NULL) {
+		if (job_ptr->part_ptr) {
+			error("%s: Could not find partition %s for node %s",
+			      pre_err, job_ptr->part_ptr->name, node_ptr->name);
+		} else {
+			error("%s: no partition ptr given for job %u and node %s",
+			      pre_err, job_ptr->job_id, node_ptr->name);
+		}
+		rc = SLURM_ERROR;
+	}
+
+	return rc;
+}
+
 /*
  * allocate resources to the given job
  *
@@ -1437,7 +1577,8 @@ static int _add_job_to_nodes(struct cr_record *cr_ptr,
 	i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
 	i_last  = bit_fls(job_resrcs_ptr->node_bitmap);
 	for (i=i_first; ((i<=i_last) && (i_first>=0)); i++) {
-		if (bit_test(job_resrcs_ptr->node_bitmap, i) == 0)
+		if (!bit_test(job_resrcs_ptr->node_bitmap, i) ||
+		    !bit_test(job_ptr->node_bitmap, i))
 			continue;
 		if (job_memory_cpu == 0)
 			cr_ptr->nodes[i].alloc_memory += job_memory_node;
@@ -2188,6 +2329,33 @@ extern int select_p_job_ready(struct job_record *job_ptr)
 	return 1;
 }
 
+extern int select_p_job_resized(struct job_record *job_ptr,
+				struct node_record *node_ptr)
+{
+	int rc = SLURM_SUCCESS;
+#ifdef HAVE_XCPU
+	char clone_path[128];
+
+	/* This node has already been cleared from the job's node_bitmap
+	 * by excise_node_from_job(), so return ownership of its XCPU
+	 * clone file to root */
+	snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone", XCPU_DIR,
+		 node_ptr->name);
+	if (chown(clone_path, (uid_t)0, (gid_t)0)) {
+		error("chown %s: %m", clone_path);
+		rc = SLURM_ERROR;
+	} else
+		debug("chown %s to 0", clone_path);
+#endif
+
+	slurm_mutex_lock(&cr_mutex);
+	if (cr_ptr == NULL)
+		_init_node_cr();
+	_rm_job_from_one_node(job_ptr, node_ptr, "select_p_job_resized");
+	slurm_mutex_unlock(&cr_mutex);
+	return rc;
+}
+
 extern int select_p_job_fini(struct job_record *job_ptr)
 {
 	int rc = SLURM_SUCCESS;
@@ -2198,9 +2366,8 @@ extern int select_p_job_fini(struct job_record *job_ptr)
 	for (i=0; i<select_node_cnt; i++) {
 		if (bit_test(job_ptr->node_bitmap, i) == 0)
 			continue;
-		snprintf(clone_path, sizeof(clone_path),
-			"%s/%s/xcpu/clone", XCPU_DIR,
-			select_node_ptr[i].name);
+		snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone",
+			 XCPU_DIR, select_node_ptr[i].name);
 		if (chown(clone_path, (uid_t)0, (gid_t)0)) {
 			error("chown %s: %m", clone_path);
 			rc = SLURM_ERROR;
diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c
index 47c2af00913..2ea03f68f19 100644
--- a/src/scontrol/update_job.c
+++ b/src/scontrol/update_job.c
@@ -637,12 +637,16 @@ static void _update_job_size(uint32_t job_id)
 
 	xstrfmtcat(fname_csh, "slurm_job_%u_resize.csh", job_id);
 	xstrfmtcat(fname_sh,  "slurm_job_%u_resize.sh", job_id);
+	(void) unlink(fname_csh);
+	(void) unlink(fname_sh);
  	if (!(resize_csh = fopen(fname_csh, "w"))) {
-		fprintf(stderr, "Could not create file %s", fname_csh);
+		fprintf(stderr, "Could not create file %s: %s\n", fname_csh, 
+			strerror(errno));
 		goto fini;
 	}
  	if (!(resize_sh = fopen(fname_sh, "w"))) {
-		fprintf(stderr, "Could not create file %s", fname_sh);
+		fprintf(stderr, "Could not create file %s: %s\n", fname_sh, 
+			strerror(errno));
 		goto fini;
 	}
 	chmod(fname_csh, 0500);	/* Make file executable */
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 65df4d8aefa..f80a2cd41ea 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1866,6 +1866,7 @@ extern void excise_node_from_job(struct job_record *job_ptr,
 	}
 	job_ptr->node_cnt = new_pos + 1;
 	bit_free(orig_bitmap);
+	(void) select_g_job_resized(job_ptr, node_ptr);
 }
 
 /*
@@ -5991,6 +5992,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 		}
 	}
 
+#ifndef HAVE_BG
 	if (job_specs->req_nodes && 
 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
 		/* Use req_nodes to change the nodes associated with a running
@@ -6024,6 +6026,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 		xfree(job_specs->req_nodes);
 		update_accounting = true;
 	}
+#endif
 
 	if (job_specs->req_nodes) {
 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
@@ -6055,6 +6058,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 		}
 	}
 
+#ifndef HAVE_BG
 	if ((job_specs->min_nodes != NO_VAL) &&
 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
 		/* Use req_nodes to change the nodes associated with a running
@@ -6092,6 +6096,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 		}
 		update_accounting = true;
 	}
+#endif
 
 	if (job_specs->ntasks_per_node != (uint16_t) NO_VAL) {
 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
-- 
GitLab