From 184eb1733631f227bdc1a6baf4dcab946aa4f9eb Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 27 Sep 2002 19:04:02 +0000
Subject: [PATCH] Fix some batch job submit bugs.

---
 src/slurmctld/agent.c         | 38 +++++++++++++++++++++++++++++------
 src/slurmctld/job_mgr.c       | 19 +++++++++++++-----
 src/slurmctld/job_scheduler.c |  5 ++---
 src/slurmctld/slurmctld.h     |  1 +
 4 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 6837952e11a..466520a202f 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -104,6 +104,7 @@ typedef struct task_info {
 
 static void alarm_handler(int dummy);
 static void queue_agent_retry (agent_info_t *agent_info_ptr, int count);
+static void slurmctld_free_job_launch_msg(batch_job_launch_msg_t * msg);
 static void spawn_retry_agent (agent_arg_t *agent_arg_ptr);
 static void *thread_per_node_rpc (void *args);
 static void *wdog (void *args);
@@ -141,7 +142,8 @@ agent (void *args)
 		fatal ("agent passed NULL node name list");
 	if ((agent_arg_ptr->msg_type != REQUEST_REVOKE_JOB_CREDENTIAL) &&
 	    (agent_arg_ptr->msg_type != REQUEST_NODE_REGISTRATION_STATUS) &&
-	    (agent_arg_ptr->msg_type != REQUEST_PING))
+	    (agent_arg_ptr->msg_type != REQUEST_PING) &&
+	    (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH))
 		fatal ("agent passed invalid message type %d", agent_arg_ptr->msg_type);
 
 	/* initialize the data structures */
@@ -239,8 +241,12 @@ cleanup:
 			xfree (agent_arg_ptr->slurm_addr);
 		if (agent_arg_ptr->node_names)
 			xfree (agent_arg_ptr->node_names);
-		if (agent_arg_ptr->msg_args)
-			xfree (agent_arg_ptr->msg_args);
+		if (agent_arg_ptr->msg_args) {
+			if (agent_arg_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) 
+				slurmctld_free_job_launch_msg (agent_arg_ptr->msg_args);
+			else
+				xfree (agent_arg_ptr->msg_args);
+		}
 		xfree (agent_arg_ptr);
 	}
 #endif
@@ -437,13 +443,13 @@ thread_per_node_rpc (void *args)
 			rc = slurm_rc_msg->return_code;
 			slurm_free_return_code_msg ( slurm_rc_msg );	
 			if (rc)
-				error ("thread_per_node_rpc/rc error %d", rc);
+				error ("thread_per_node_rpc/rc error %s", 
+				       slurm_strerror (rc));
 			else {
 				debug3 ("agent sucessfully processed RPC to node %s", 
 				        thread_ptr->node_name);
-				thread_state = DSH_DONE;
 			}
-
+			thread_state = DSH_DONE;
 			break ;
 		default:
 			error ("thread_per_node_rpc bad msg_type %d",response_msg.msg_type);
@@ -608,3 +614,23 @@ spawn_retry_agent (agent_arg_t *agent_arg_ptr)
 			fatal ("pthread_create error %m");
 	}
 }
+
+/* slurmctld_free_job_launch_msg is a variant of slurm_free_job_launch_msg
+ *	because all environment variables are currently loaded in one xmalloc
+ *	buffer (see get_job_env()), which is different from how slurmd
+ *	assembles the data from a message */
+
+static void slurmctld_free_job_launch_msg(batch_job_launch_msg_t * msg)
+{
+	if (msg) {
+		if (msg->environment) {
+			if (msg->environment[0])	/* one buffer holds every variable */
+				xfree(msg->environment[0]);
+
+			xfree(msg->environment);
+			msg->environment = NULL;	/* presumably keeps the generic free off this array */
+		}
+		slurm_free_job_launch_msg (msg);
+	}
+}
+
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 5881a30144e..4f84265587d 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -361,9 +361,11 @@ dump_job_details_state (struct job_details *detail_ptr, void **buf_ptr, int *buf
 
 	pack32  ((uint32_t) detail_ptr->num_procs, buf_ptr, buf_len);
 	pack32  ((uint32_t) detail_ptr->num_nodes, buf_ptr, buf_len);
+
 	pack16  ((uint16_t) detail_ptr->shared, buf_ptr, buf_len);
 	pack16  ((uint16_t) detail_ptr->contiguous, buf_ptr, buf_len);
 	pack16  ((uint16_t) detail_ptr->kill_on_node_fail, buf_ptr, buf_len);
+	pack16  ((uint16_t) detail_ptr->batch_flag, buf_ptr, buf_len);
 
 	pack32  ((uint32_t) detail_ptr->min_procs, buf_ptr, buf_len);
 	pack32  ((uint32_t) detail_ptr->min_memory, buf_ptr, buf_len);
@@ -466,7 +468,7 @@ load_job_state ( void )
 	uint16_t job_state, next_step_id, details;
 	char *nodes = NULL, *partition = NULL, *name = NULL;
 	uint32_t num_procs, num_nodes, min_procs, min_memory, min_tmp_disk, submit_time;
-	uint16_t shared, contiguous, kill_on_node_fail, name_len;
+	uint16_t shared, contiguous, kill_on_node_fail, name_len, batch_flag;
 	char *req_nodes = NULL, *features = NULL;
 	char  *stderr = NULL, *stdin = NULL, *stdout = NULL, *work_dir = NULL;
 	slurm_job_credential_t *credential_ptr = NULL;
@@ -536,9 +538,11 @@ load_job_state ( void )
 
 			safe_unpack32 (&num_procs, &buf_ptr, &buffer_size);
 			safe_unpack32 (&num_nodes, &buf_ptr, &buffer_size);
+
 			safe_unpack16 (&shared, &buf_ptr, &buffer_size);
 			safe_unpack16 (&contiguous, &buf_ptr, &buffer_size);
 			safe_unpack16 (&kill_on_node_fail, &buf_ptr, &buffer_size);
+			safe_unpack16 (&batch_flag, &buf_ptr, &buffer_size);
 
 			safe_unpack32 (&min_procs, &buf_ptr, &buffer_size);
 			safe_unpack32 (&min_memory, &buf_ptr, &buffer_size);
@@ -617,7 +621,7 @@ load_job_state ( void )
 			job_ptr->details->shared = shared;
 			job_ptr->details->contiguous = contiguous;
 			job_ptr->details->kill_on_node_fail = kill_on_node_fail;
-			job_ptr->details->kill_on_node_fail = 1;
+			job_ptr->details->batch_flag = batch_flag;
 			job_ptr->details->min_procs = min_procs;
 			job_ptr->details->min_memory = min_memory;
 			job_ptr->details->min_tmp_disk = min_tmp_disk;
@@ -1275,10 +1279,15 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
 		goto cleanup ;
 	}
 
-	if ( ( error_code = copy_job_desc_to_file ( job_desc , (*job_rec_ptr)->job_id ) ) )  {
-		error_code = ESLURM_WRITING_TO_FILE ;
-		goto cleanup ;
+	if (job_desc->script) {
+		if ( ( error_code = copy_job_desc_to_file ( job_desc , (*job_rec_ptr)->job_id ) ) )  {
+			error_code = ESLURM_WRITING_TO_FILE ;
+			goto cleanup ;
+		}
+		(*job_rec_ptr)->details->batch_flag = 1;
 	}
+	else
+		(*job_rec_ptr)->details->batch_flag = 0;
 
 	if (part_ptr->shared == SHARED_FORCE)		/* shared=force */
 		(*job_rec_ptr)->details->shared = 1;
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 29f6661194d..864e7a1d092 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -208,7 +208,7 @@ launch_job (struct job_record *job_ptr)
 	pthread_attr_t attr_agent;
 	pthread_t thread_agent;
 
-/*	if (job_ptr->details->batch_flag == 0) */
+	if (job_ptr->details->batch_flag == 0)
 		return;
 
 	node_ptr = find_first_node_record (job_ptr -> node_bitmap);
@@ -235,10 +235,9 @@ launch_job (struct job_record *job_ptr)
 	agent_arg_ptr -> slurm_addr = xmalloc (sizeof (struct sockaddr_in));
 	memcpy (agent_arg_ptr -> slurm_addr, 
 		&(node_ptr -> slurm_addr), sizeof (struct sockaddr_in));
-	agent_arg_ptr -> node_names = node_ptr -> name;
+	agent_arg_ptr -> node_names = xstrdup (node_ptr -> name);
 	agent_arg_ptr -> msg_type = REQUEST_BATCH_JOB_LAUNCH;
 	agent_arg_ptr -> msg_args = (void *)launch_msg_ptr;
-/* FIXME: Agent must perform full data structure cleanup for launch_msg_ptr */
 
 	/* Launch the RPC via agent */
 	debug3 ("Spawning job launch agent for job_id %u", job_ptr -> job_id);
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 0f321c41861..d7c73171897 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -171,6 +171,7 @@ extern int job_count;			/* number of jobs in the system */
 /* job_details - specification of a job's constraints */
 struct job_details {
 	uint32_t magic;			/* magic cookie to test data integrity */
+	uint16_t batch_flag;		/* 1 if batch job (with script) */
 	uint32_t num_procs;		/* minimum number of processors */
 	uint32_t num_nodes;		/* minimum number of nodes */
 	char *req_nodes;		/* required nodes */
-- 
GitLab