From 43378a5ea35efb0188e642d88c01d47eff9d2d71 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 20 Sep 2004 16:42:39 +0000
Subject: [PATCH] Restructure some code so that the bgl_part_id can be
 picked-up and reported as desired. Add new function to drain node (for use
 by select/bluegene node monitoring thread).

---
 src/slurmctld/job_mgr.c   |  99 +++++----------------
 src/slurmctld/node_mgr.c  |   2 +-
 src/slurmctld/proc_req.c  | 181 ++++++++++++++++++--------------------
 src/slurmctld/slurmctld.h |  39 ++++----
 src/slurmctld/srun_comm.c |   2 +-
 5 files changed, 126 insertions(+), 197 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 94856abe7bb..f5ee6acc544 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -97,8 +97,7 @@ static void _excise_node_from_job(struct job_record *job_ptr,
 static int  _find_batch_dir(void *x, void *key);
 static void _get_batch_job_dir_ids(List batch_dirs);
 static void _job_timed_out(struct job_record *job_ptr);
-static int  _job_create(job_desc_msg_t * job_specs, uint32_t * new_job_id,
-		        int allocate, int will_run,
+static int  _job_create(job_desc_msg_t * job_specs, int allocate, int will_run,
 		        struct job_record **job_rec_ptr, uid_t submit_uid);
 static void _list_delete_job(void *job_entry);
 static int  _list_find_job_id(void *job_entry, void *key);
@@ -1237,21 +1236,12 @@ extern void rehash_jobs(void)
  * job_allocate - create job_records for the suppied job specification and 
  *	allocate nodes for it.
  * IN job_specs - job specifications
- * IN node_list - location for storing new job's allocated nodes
  * IN immediate - if set then either initiate the job immediately or fail
  * IN will_run - don't initiate the job if set, just test if it could run 
  *	now or later
  * IN allocate - resource allocation request if set, not a full job
- * OUT new_job_id - the new job's ID
- * OUT num_cpu_groups - number of cpu groups (elements in cpus_per_node 
- *	and cpu_count_reps)
- * OUT cpus_per_node - pointer to array of numbers of cpus on each node 
- *	allocate
- * OUT cpu_count_reps - pointer to array of numbers of consecutive nodes 
- *	having same cpu count
- * OUT node_list - list of nodes allocated to the job
- * OUT node_cnt - number of allocated nodes
- * OUT node_addr - slurm_addr's for the allocated nodes
+ * IN submit_uid - uid of user issuing the request
+ * OUT job_pptr - set to pointer to job record
  * RET 0 or an error code. If the job would only be able to execute with 
  *	some change in partition configuration then 
  *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
@@ -1263,18 +1253,16 @@ extern void rehash_jobs(void)
  *	default_part_loc - pointer to default partition
  * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part
  */
-int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
-		 char **node_list, uint16_t * num_cpu_groups,
-		 uint32_t ** cpus_per_node, uint32_t ** cpu_count_reps,
-		 int immediate, int will_run, int allocate,
-		 uid_t submit_uid, uint16_t * node_cnt,
-		 slurm_addr ** node_addr)
+extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, 
+		int allocate, uid_t submit_uid, struct job_record **job_pptr)
 {
 	int error_code;
 	bool no_alloc, top_prio, test_only, too_fragmented, independent;
 	struct job_record *job_ptr;
-	error_code = _job_create(job_specs, new_job_id, allocate, will_run,
+	error_code = _job_create(job_specs, allocate, will_run,
 				 &job_ptr, submit_uid);
+	*job_pptr = job_ptr;
+
 	if (error_code) {
 		if (immediate && job_ptr) {
 			job_ptr->job_state = JOB_FAILED;
@@ -1284,9 +1272,7 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 		}
 		return error_code;
 	}
-	if (job_ptr == NULL)
-		fatal("job_allocate: allocated job %u lacks record",
-		      new_job_id);
+	xassert(job_ptr);
 
 	independent = job_independent(job_ptr);
 
@@ -1322,22 +1308,8 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 	}
 
 	test_only = will_run || (allocate == 0);
-	if (!test_only) {
-		/* Some of these pointers are NULL on submit */
-		if (num_cpu_groups)
-			*num_cpu_groups = 0;
-		if (node_list)
-			*node_list = NULL;
-		if (cpus_per_node)
-			*cpus_per_node = NULL;
-		if (cpu_count_reps)
-			*cpu_count_reps = NULL;
-		if (node_cnt)
-			*node_cnt = 0;
-		if (node_addr)
-			*node_addr = (slurm_addr *) NULL;
+	if (!test_only)
 		last_job_update = time(NULL);
-	}
 
 	no_alloc = test_only || too_fragmented || 
 			(!top_prio) || (!independent);
@@ -1371,21 +1343,6 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 		job_ptr->end_time   = 0;
 	}
 
-	if (!no_alloc) {
-		if (node_list)
-			*node_list = job_ptr->nodes;
-		if (num_cpu_groups)
-			*num_cpu_groups = job_ptr->num_cpu_groups;
-		if (cpus_per_node)
-			*cpus_per_node = job_ptr->cpus_per_node;
-		if (cpu_count_reps)
-			*cpu_count_reps = job_ptr->cpu_count_reps;
-		if (node_cnt)
-			*node_cnt = job_ptr->node_cnt;
-		if (node_addr)
-			*node_addr = job_ptr->node_addr;
-	}
-
 	return SLURM_SUCCESS;
 }
 
@@ -1575,7 +1532,6 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
  * input: job_specs - job specifications
  * IN allocate - resource allocation request if set rather than job submit
  * IN will_run - job is not to be created, test of validity only
- * OUT new_job_id - the job's ID
  * OUT job_pptr - pointer to the job (NULL on error)
  * RET 0 on success, otherwise ESLURM error code. If the job would only be
  *	able to execute with some change in partition configuration then
@@ -1586,8 +1542,7 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
  *	job_hash - hash table into job records
  */
 
-static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
-		       int allocate, int will_run,
+static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 		       struct job_record **job_pptr, uid_t submit_uid)
 {
 	int error_code = SLURM_SUCCESS, i;
@@ -1760,7 +1715,6 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
 		(*job_pptr)->batch_flag = 1;
 	} else
 		(*job_pptr)->batch_flag = 0;
-	*new_job_id = (*job_pptr)->job_id;
 
 	/* Insure that requested partition is valid right now, 
 	 * otherwise leave job queued and provide warning code */
@@ -1772,19 +1726,20 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
 	if ((!super_user) && 
 	    (job_desc->min_nodes > part_ptr->max_nodes)) {
 		info("Job %u requested too many nodes (%d) of "
-			"partition %s(%d)", *new_job_id, job_desc->min_nodes, 
+			"partition %s(%d)", 
+			(*job_pptr)->job_id, job_desc->min_nodes, 
 			part_ptr->name, part_ptr->max_nodes);
 		fail_reason = WAIT_PART_NODE_LIMIT;
 	} else if ((!super_user) &&
 	           (job_desc->max_nodes != 0) &&    /* no max_nodes for job */
 		   (job_desc->max_nodes < part_ptr->min_nodes)) {
 		info("Job %u requested too few nodes (%d) of partition %s(%d)",
-		     *new_job_id, job_desc->max_nodes, 
-		     part_ptr->name, part_ptr->min_nodes);
+			(*job_pptr)->job_id, job_desc->max_nodes, 
+			part_ptr->name, part_ptr->min_nodes);
 		fail_reason = WAIT_PART_NODE_LIMIT;
 	} else if (part_ptr->state_up == 0) {
 		info("Job %u requested down partition %s", 
-		     *new_job_id, part_ptr->name);
+			(*job_pptr)->job_id, part_ptr->name);
 		fail_reason = WAIT_PART_STATE;
 	}
 	if (fail_reason != WAIT_NO_REASON) {
@@ -3382,13 +3337,10 @@ kill_job_on_node(uint32_t job_id, struct node_record *node_ptr)
  * old_job_info - get details about an existing job allocation
  * IN uid - job issuing the code
  * IN job_id - ID of job for which info is requested
- * OUT everything else - the job's details
+ * OUT job_pptr - set to pointer to job record
  */
-int
-old_job_info(uint32_t uid, uint32_t job_id, char **node_list,
-	     uint16_t * num_cpu_groups, uint32_t ** cpus_per_node,
-	     uint32_t ** cpu_count_reps, uint16_t * node_cnt,
-	     slurm_addr ** node_addr)
+extern int
+old_job_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr)
 {
 	struct job_record *job_ptr;
 
@@ -3403,18 +3355,7 @@ old_job_info(uint32_t uid, uint32_t job_id, char **node_list,
 	if (IS_JOB_FINISHED(job_ptr))
 		return ESLURM_ALREADY_DONE;
 
-	if (node_list)
-		*node_list = job_ptr->nodes;
-	if (num_cpu_groups)
-		*num_cpu_groups = job_ptr->num_cpu_groups;
-	if (cpus_per_node)
-		*cpus_per_node = job_ptr->cpus_per_node;
-	if (cpu_count_reps)
-		*cpu_count_reps = job_ptr->cpu_count_reps;
-	if (node_cnt)
-		*node_cnt = job_ptr->node_cnt;
-	if (node_addr)
-		*node_addr = job_ptr->node_addr;
+	*job_pptr = job_ptr;
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index a6a26950583..de974696ecd 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -841,7 +841,7 @@ void set_slurmd_addr (void)
 /* 
  * update_node - update the configuration data for one or more nodes
  * IN update_node_msg - update node request
- * RET 0 or error code
+ * RET SLURM_SUCCESS or error code
  * global: node_record_table_ptr - pointer to global node table
  */
 int update_node ( update_node_msg_t * update_node_msg ) 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 56e71ae7907..a3f0a411b71 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -358,19 +358,14 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	slurm_msg_t response_msg;
 	DEF_TIMERS;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
-	char *node_list_ptr = NULL;
-	uint16_t num_cpu_groups = 0;
-	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
-	uint32_t job_id = 0;
 	resource_allocation_response_msg_t alloc_msg;
 	/* Locks: Read config, write job, write node, read partition */
 	slurmctld_lock_t job_write_lock = { 
 		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 	uid_t uid;
-	uint16_t node_cnt = 0;
-	slurm_addr *node_addr = NULL;
 	int immediate = job_desc_msg->immediate;
 	bool do_unlock = false;
+	struct job_record *job_ptr;
 
 	START_TIMER;
 	debug2("Processing RPC: REQUEST_RESOURCE_ALLOCATION");
@@ -387,11 +382,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	if (error_code == SLURM_SUCCESS) {
 		do_unlock = true;
 		lock_slurmctld(job_write_lock);
-		error_code = job_allocate(job_desc_msg, &job_id,
-					  &node_list_ptr, &num_cpu_groups,
-					  &cpus_per_node, &cpu_count_reps,
-					  immediate, false, true, uid,
-					  &node_cnt, &node_addr);
+		error_code = job_allocate(job_desc_msg,
+				immediate, false, true, uid, &job_ptr);
 		/* unlock after finished using the job structure data */
 		END_TIMER;
 	}
@@ -400,29 +392,30 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	if ((error_code == SLURM_SUCCESS) ||
 	    ((immediate == 0) && 
 	     (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE))) {
+		xassert(job_ptr);
 		info("_slurm_rpc_allocate_resources JobId=%u NodeList=%s %s",
-			job_id, node_list_ptr, TIME_STR);
+			job_ptr->job_id, job_ptr->nodes, TIME_STR);
 
 		/* send job_ID  and node_name_ptr */
 		alloc_msg.cpu_count_reps = xmalloc(sizeof(uint32_t) *
-				num_cpu_groups);
-		memcpy(alloc_msg.cpu_count_reps, cpu_count_reps,
-				(sizeof(uint32_t) * num_cpu_groups));
+				job_ptr->num_cpu_groups);
+		memcpy(alloc_msg.cpu_count_reps, job_ptr->cpu_count_reps,
+				(sizeof(uint32_t) * job_ptr->num_cpu_groups));
 		alloc_msg.cpus_per_node  = xmalloc(sizeof(uint32_t) *
-				num_cpu_groups);
-		memcpy(alloc_msg.cpus_per_node, cpus_per_node,
-				(sizeof(uint32_t) * num_cpu_groups));
+				job_ptr->num_cpu_groups);
+		memcpy(alloc_msg.cpus_per_node, job_ptr->cpus_per_node,
+				(sizeof(uint32_t) * job_ptr->num_cpu_groups));
 		alloc_msg.error_code     = error_code;
-		alloc_msg.job_id         = job_id;
+		alloc_msg.job_id         = job_ptr->job_id;
 		alloc_msg.node_addr      = xmalloc(sizeof(slurm_addr) *
-				node_cnt);
-		memcpy(alloc_msg.node_addr, node_addr, 
-				(sizeof(slurm_addr) * node_cnt));
-		alloc_msg.node_cnt       = node_cnt;
-		alloc_msg.node_list      = xstrdup(node_list_ptr);
-		alloc_msg.num_cpu_groups = num_cpu_groups;
+				job_ptr->node_cnt);
+		memcpy(alloc_msg.node_addr, job_ptr->node_addr, 
+				(sizeof(slurm_addr) * job_ptr->node_cnt));
+		alloc_msg.node_cnt       = job_ptr->node_cnt;
+		alloc_msg.node_list      = xstrdup(job_ptr->nodes);
+		alloc_msg.num_cpu_groups = job_ptr->num_cpu_groups;
 #ifdef HAVE_BGL
-		alloc_msg.bgl_part_id    = xstrdup(DEFAULT_BGL_PART_ID);
+		alloc_msg.bgl_part_id    = xstrdup(job_ptr->bgl_part_id);
 #endif
 		unlock_slurmctld(job_write_lock);
 
@@ -430,7 +423,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		response_msg.data = &alloc_msg;
 
 		if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0)
-			_kill_job_on_msg_fail(job_id);
+			_kill_job_on_msg_fail(job_ptr->job_id);
 		xfree(alloc_msg.cpu_count_reps);
 		xfree(alloc_msg.cpus_per_node);
 		xfree(alloc_msg.node_addr);
@@ -458,20 +451,15 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 	slurm_msg_t response_msg;
 	DEF_TIMERS;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
-	char *node_list_ptr = NULL;
-	uint16_t num_cpu_groups = 0;
-	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
-	uint32_t job_id;
 	resource_allocation_and_run_response_msg_t alloc_msg;
 	struct step_record *step_rec;
+	struct job_record *job_ptr;
 	slurm_cred_t slurm_cred;
 	job_step_create_request_msg_t req_step_msg;
 	/* Locks: Write job, write node, read partition */
 	slurmctld_lock_t job_write_lock = { 
 		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 	uid_t uid;
-	uint16_t node_cnt;
-	slurm_addr *node_addr;
 	int immediate = true;   /* implicit job_desc_msg->immediate == true */
 
 	START_TIMER;
@@ -497,11 +485,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 #endif
 
 	lock_slurmctld(job_write_lock);
-	error_code = job_allocate(job_desc_msg, &job_id,
-				  &node_list_ptr, &num_cpu_groups,
-				  &cpus_per_node, &cpu_count_reps,
-				  immediate, false, true, uid,
-				  &node_cnt, &node_addr);
+	error_code = job_allocate(job_desc_msg, 
+			immediate, false, true, uid, &job_ptr);
 
 	/* return result */
 	if (error_code) {
@@ -512,7 +497,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		return;
 	}
 
-	req_step_msg.job_id     = job_id;
+	req_step_msg.job_id     = job_ptr->job_id;
 	req_step_msg.user_id    = job_desc_msg->user_id;
 #ifdef HAVE_BGL
 	req_step_msg.node_count = 1;
@@ -531,7 +516,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 
 	/* note: no need to free step_rec, pointer to global job step record */
 	if (error_code) {
-		job_complete(job_id, job_desc_msg->user_id, false, 0);
+		job_complete(job_ptr->job_id, job_desc_msg->user_id, false, 0);
 		unlock_slurmctld(job_write_lock);
 		info("_slurm_rpc_allocate_and_run creating job step: %s",
 			slurm_strerror(error_code));
@@ -539,17 +524,17 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 	} else {
 
 		info("_slurm_rpc_allocate_and_run JobId=%u NodeList=%s %s", 
-			job_id, node_list_ptr, TIME_STR);
+			job_ptr->job_id, job_ptr->nodes, TIME_STR);
 
 		/* send job_ID  and node_name_ptr */
-		alloc_msg.job_id         = job_id;
-		alloc_msg.node_list      = node_list_ptr;
-		alloc_msg.num_cpu_groups = num_cpu_groups;
-		alloc_msg.cpus_per_node  = cpus_per_node;
-		alloc_msg.cpu_count_reps = cpu_count_reps;
+		alloc_msg.job_id         = job_ptr->job_id;
+		alloc_msg.node_list      = job_ptr->nodes;
+		alloc_msg.num_cpu_groups = job_ptr->num_cpu_groups;
+		alloc_msg.cpus_per_node  = job_ptr->cpus_per_node;
+		alloc_msg.cpu_count_reps = job_ptr->cpu_count_reps;
 		alloc_msg.job_step_id    = step_rec->step_id;
-		alloc_msg.node_cnt       = node_cnt;
-		alloc_msg.node_addr      = node_addr;
+		alloc_msg.node_cnt       = job_ptr->node_cnt;
+		alloc_msg.node_addr      = job_ptr->node_addr;
 		alloc_msg.cred           = slurm_cred;
 		alloc_msg.switch_job     = switch_copy_jobinfo(
 						step_rec->switch_job);
@@ -558,7 +543,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		response_msg.data = &alloc_msg;
 
 		if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0)
-			_kill_job_on_msg_fail(job_id);
+			_kill_job_on_msg_fail(job_ptr->job_id);
 		slurm_cred_destroy(slurm_cred);
 		switch_free_jobinfo(alloc_msg.switch_job);
 		schedule_job_save();	/* has own locks */
@@ -1089,11 +1074,8 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 	/* init */
 	DEF_TIMERS;
 	int error_code = SLURM_SUCCESS;
-	uint16_t num_cpu_groups = 0;
-	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
-	uint32_t job_id;
+	struct job_record *job_ptr;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
-	char *node_list_ptr = NULL;
 	/* Locks: Write job, read node, read partition */
 	slurmctld_lock_t job_write_lock = { 
 		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
@@ -1113,11 +1095,8 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 
 	if (error_code == SLURM_SUCCESS) {
 		lock_slurmctld(job_write_lock);
-		error_code = job_allocate(job_desc_msg, &job_id,
-					  &node_list_ptr, &num_cpu_groups,
-					  &cpus_per_node, &cpu_count_reps,
-					  false, true, true, uid, NULL,
-					  NULL);
+		error_code = job_allocate(job_desc_msg, 
+				false, true, true, uid, &job_ptr);
 		unlock_slurmctld(job_write_lock);
 		END_TIMER;
 	}
@@ -1201,18 +1180,14 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 {
 	int error_code = SLURM_SUCCESS;
 	slurm_msg_t response_msg;
+	struct job_record *job_ptr;
 	DEF_TIMERS;
 	old_job_alloc_msg_t *job_desc_msg =
 	    (old_job_alloc_msg_t *) msg->data;
-	char *node_list_ptr = NULL;
-	uint16_t num_cpu_groups = 0;
-	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
 	resource_allocation_response_msg_t alloc_msg;
 	/* Locks: Read job, read node */
 	slurmctld_lock_t job_read_lock = { 
 		NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
-	uint16_t node_cnt;
-	slurm_addr *node_addr;
 	uid_t uid;
 	bool do_unlock = false;
 
@@ -1230,15 +1205,12 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 		do_unlock = true;
 		lock_slurmctld(job_read_lock);
 		error_code = old_job_info(job_desc_msg->uid,
-					  job_desc_msg->job_id,
-					  &node_list_ptr, &num_cpu_groups,
-					  &cpus_per_node, &cpu_count_reps,
-					  &node_cnt, &node_addr);
+					  job_desc_msg->job_id, &job_ptr);
 		END_TIMER;
 	}
 
 	/* return result */
-	if (error_code) {
+	if (error_code || (job_ptr == NULL)) {
 		if (do_unlock)
 			unlock_slurmctld(job_read_lock);
 		debug2("_slurm_rpc_old_job_alloc: JobId=%u, uid=%u: %s",
@@ -1247,28 +1219,29 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		debug2("_slurm_rpc_old_job_alloc JobId=%u NodeList=%s %s",
-			job_desc_msg->job_id, node_list_ptr, TIME_STR);
+			job_desc_msg->job_id, job_ptr->nodes, TIME_STR);
 
 		/* send job_ID  and node_name_ptr */
 		alloc_msg.cpu_count_reps = xmalloc(sizeof(uint32_t) *
-				num_cpu_groups);
-		memcpy(alloc_msg.cpu_count_reps, cpu_count_reps,
-				(sizeof(uint32_t) * num_cpu_groups));
+				job_ptr->num_cpu_groups);
+		memcpy(alloc_msg.cpu_count_reps, 
+				job_ptr->cpu_count_reps,
+				(sizeof(uint32_t) * job_ptr->num_cpu_groups));
 		alloc_msg.cpus_per_node  = xmalloc(sizeof(uint32_t) *
-				num_cpu_groups);
-		memcpy(alloc_msg.cpus_per_node, cpus_per_node,
-				(sizeof(uint32_t) * num_cpu_groups));
+				job_ptr->num_cpu_groups);
+		memcpy(alloc_msg.cpus_per_node, job_ptr->cpus_per_node,
+				(sizeof(uint32_t) * job_ptr->num_cpu_groups));
 		alloc_msg.error_code     = error_code;
 		alloc_msg.job_id         = job_desc_msg->job_id;
 		alloc_msg.node_addr      = xmalloc(sizeof(slurm_addr) *
-				node_cnt);
-		memcpy(alloc_msg.node_addr, node_addr,
-				(sizeof(slurm_addr) * node_cnt));
-		alloc_msg.node_cnt       = node_cnt;
-		alloc_msg.node_list      = xstrdup(node_list_ptr);
-		alloc_msg.num_cpu_groups = num_cpu_groups;
+				job_ptr->node_cnt);
+		memcpy(alloc_msg.node_addr, job_ptr->node_addr,
+				(sizeof(slurm_addr) * job_ptr->node_cnt));
+		alloc_msg.node_cnt       = job_ptr->node_cnt;
+		alloc_msg.node_list      = xstrdup(job_ptr->nodes);
+		alloc_msg.num_cpu_groups = job_ptr->num_cpu_groups;
 #ifdef HAVE_BGL
-		alloc_msg.bgl_part_id    = xstrdup(DEFAULT_BGL_PART_ID);
+		alloc_msg.bgl_part_id    = xstrdup(job_ptr->bgl_part_id);
 #endif
 		unlock_slurmctld(job_read_lock);
 
@@ -1444,7 +1417,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	/* init */
 	int error_code = SLURM_SUCCESS;
 	DEF_TIMERS;
-	uint32_t job_id;
+	struct job_record *job_ptr;
 	slurm_msg_t response_msg;
 	submit_response_msg_t submit_msg;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
@@ -1467,12 +1440,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	}
 	if (error_code == SLURM_SUCCESS) {
 		lock_slurmctld(job_write_lock);
-		error_code = job_allocate(job_desc_msg, &job_id,
-					  (char **) NULL,
-					  (uint16_t *) NULL,
-					  (uint32_t **) NULL,
-					  (uint32_t **) NULL, false, false,
-					  false, uid, NULL, NULL);
+		error_code = job_allocate(job_desc_msg, false, false,
+					  false, uid, &job_ptr);
 		unlock_slurmctld(job_write_lock);
 		END_TIMER;
 	}
@@ -1481,14 +1450,13 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	if ((error_code != SLURM_SUCCESS) &&
 	    (error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
 		info("_slurm_rpc_submit_batch_job: %s",
-		     slurm_strerror(error_code));
+			slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
-		info(
-		   "_slurm_rpc_submit_batch_job JobId=%u %s", 
-		   job_id, TIME_STR);
+		info("_slurm_rpc_submit_batch_job JobId=%u %s", 
+			job_ptr->job_id, TIME_STR);
 		/* send job_ID */
-		submit_msg.job_id     = job_id;
+		submit_msg.job_id     = job_ptr->job_id;
 		submit_msg.error_code = error_code;
 		response_msg.msg_type = RESPONSE_SUBMIT_BATCH_JOB;
 		response_msg.data = &submit_msg;
@@ -1538,6 +1506,31 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg)
 	}
 }
 
+/*
+ * slurm_drain_nodes - process a request to drain a list of nodes
+ * node_list IN - list of nodes to drain
+ * reason IN - reason to drain the nodes
+ * RET SLURM_SUCCESS or error code
+ */
+extern int slurm_drain_nodes(char *node_list, char *reason)
+{
+	int error_code;
+	update_node_msg_t update_node_msg;
+	/* Locks: Write node */
+	slurmctld_lock_t node_write_lock = { 
+		NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
+
+	update_node_msg.node_names = node_list;
+	update_node_msg.node_state = NODE_STATE_DRAINED;
+	update_node_msg.reason = reason;
+
+	lock_slurmctld(node_write_lock);
+	error_code = update_node(&update_node_msg);
+	unlock_slurmctld(node_write_lock);
+
+	return error_code;
+}
+
 /* _slurm_rpc_update_node - process RPC to update the configuration of a 
  *	node (e.g. UP/DOWN) */
 static void _slurm_rpc_update_node(slurm_msg_t * msg)
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 641aebc5b7f..77a85b7b3a2 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -609,21 +609,12 @@ extern bool is_node_resp (char *name);
  * job_allocate - create job_records for the suppied job specification and 
  *	allocate nodes for it.
  * IN job_specs - job specifications
- * IN node_list - location for storing new job's allocated nodes
  * IN immediate - if set then either initiate the job immediately or fail
  * IN will_run - don't initiate the job if set, just test if it could run 
  *	now or later
  * IN allocate - resource allocation request if set, not a full job
- * OUT new_job_id - the new job's ID
- * OUT num_cpu_groups - number of cpu groups (elements in cpus_per_node 
- *	and cpu_count_reps)
- * OUT cpus_per_node - pointer to array of numbers of cpus on each node 
- *	allocate
- * OUT cpu_count_reps - pointer to array of numbers of consecutive nodes 
- *	having same cpu count
- * OUT node_list - list of nodes allocated to the job
- * OUT node_cnt - number of allocated nodes
- * OUT node_addr - slurm_addr's for the allocated nodes
+ * IN submit_uid - uid of user issuing the request
+ * OUT job_pptr - set to pointer to job record
  * RET 0 or an error code. If the job would only be able to execute with 
  *	some change in partition configuration then 
  *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
@@ -632,13 +623,11 @@ extern bool is_node_resp (char *name);
  *	and cpu_count_reps={4,2,2}
  * globals: job_list - pointer to global job list 
  *	list_part - global list of partition info
- *	default_part_loc - pointer to default partition 
+ *	default_part_loc - pointer to default partition
+ * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part
  */
-extern int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
-	     char **node_list, uint16_t * num_cpu_groups,
-	     uint32_t ** cpus_per_node, uint32_t ** cpu_count_reps,
-	     int immediate, int will_run, int allocate, uid_t submit_uid,
-	     uint16_t * node_cnt, slurm_addr ** node_addr);
+extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, 
+		int allocate, uid_t submit_uid, struct job_record **job_pptr);
 
 /* log the completion of the specified job */
 extern void job_completion_logger(struct job_record  *job_ptr);
@@ -887,12 +876,10 @@ extern void node_not_resp (char *name, time_t msg_time);
  * old_job_info - get details about an existing job allocation
  * IN uid - job issuing the code
  * IN job_id - ID of job for which info is requested
- * OUT everything else - the job's detains
+ * OUT job_pptr - set to pointer to job record
  */
-extern int old_job_info (uint32_t uid, uint32_t job_id, char **node_list, 
-	uint16_t * num_cpu_groups, uint32_t ** cpus_per_node, 
-	uint32_t ** cpu_count_reps,
-	uint16_t * node_cnt, slurm_addr ** node_addr);
+extern int old_job_info(uint32_t uid, uint32_t job_id, 
+		struct job_record **job_pptr);
 
 
 /* 
@@ -1104,6 +1091,14 @@ extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
  */
 extern int slurmctld_shutdown(void);
 
+/*
+ * slurm_drain_nodes - process a request to drain a list of nodes
+ * node_list IN - list of nodes to drain
+ * reason IN - reason to drain the nodes
+ * RET SLURM_SUCCESS or error code
+ */
+extern int slurm_drain_nodes(char *node_list, char *reason);
+
 /*
  * step_create - creates a step_record in step_specs->job_id, sets up the
  *	accoding to the step_specs.
diff --git a/src/slurmctld/srun_comm.c b/src/slurmctld/srun_comm.c
index db063c26efa..fe02f9435c3 100644
--- a/src/slurmctld/srun_comm.c
+++ b/src/slurmctld/srun_comm.c
@@ -90,7 +90,7 @@ extern void srun_allocate (uint32_t job_id)
 		memcpy(msg_arg->node_addr, job_ptr->node_addr,
 				(sizeof(slurm_addr) * job_ptr->node_cnt));
 #ifdef HAVE_BGL
-		msg_arg->bgl_part_id    = xstrdup(DEFAULT_BGL_PART_ID);
+		msg_arg->bgl_part_id    = xstrdup(job_ptr->bgl_part_id);
 #endif
 		msg_arg->error_code	= SLURM_SUCCESS;
 		_srun_agent_launch(addr, job_ptr->host, 
-- 
GitLab