From 184eb1733631f227bdc1a6baf4dcab946aa4f9eb Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 27 Sep 2002 19:04:02 +0000 Subject: [PATCH] Fix some batch job submit bugs. --- src/slurmctld/agent.c | 38 +++++++++++++++++++++++++++++------ src/slurmctld/job_mgr.c | 19 +++++++++++++----- src/slurmctld/job_scheduler.c | 5 ++--- src/slurmctld/slurmctld.h | 1 + 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 6837952e11a..466520a202f 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -104,6 +104,7 @@ typedef struct task_info { static void alarm_handler(int dummy); static void queue_agent_retry (agent_info_t *agent_info_ptr, int count); +static void slurmctld_free_job_launch_msg(batch_job_launch_msg_t * msg); static void spawn_retry_agent (agent_arg_t *agent_arg_ptr); static void *thread_per_node_rpc (void *args); static void *wdog (void *args); @@ -141,7 +142,8 @@ agent (void *args) fatal ("agent passed NULL node name list"); if ((agent_arg_ptr->msg_type != REQUEST_REVOKE_JOB_CREDENTIAL) && (agent_arg_ptr->msg_type != REQUEST_NODE_REGISTRATION_STATUS) && - (agent_arg_ptr->msg_type != REQUEST_PING)) + (agent_arg_ptr->msg_type != REQUEST_PING) && + (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)) fatal ("agent passed invalid message type %d", agent_arg_ptr->msg_type); /* initialize the data structures */ @@ -239,8 +241,12 @@ cleanup: xfree (agent_arg_ptr->slurm_addr); if (agent_arg_ptr->node_names) xfree (agent_arg_ptr->node_names); - if (agent_arg_ptr->msg_args) - xfree (agent_arg_ptr->msg_args); + if (agent_arg_ptr->msg_args) { + if (agent_arg_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) + slurmctld_free_job_launch_msg (agent_arg_ptr->msg_args); + else + xfree (agent_arg_ptr->msg_args); + } xfree (agent_arg_ptr); } #endif @@ -437,13 +443,13 @@ thread_per_node_rpc (void *args) rc = slurm_rc_msg->return_code; slurm_free_return_code_msg ( slurm_rc_msg ); if (rc) - 
error ("thread_per_node_rpc/rc error %d", rc); + error ("thread_per_node_rpc/rc error %s", + slurm_strerror (rc)); else { debug3 ("agent sucessfully processed RPC to node %s", thread_ptr->node_name); - thread_state = DSH_DONE; } - + thread_state = DSH_DONE; break ; default: error ("thread_per_node_rpc bad msg_type %d",response_msg.msg_type); @@ -608,3 +614,23 @@ spawn_retry_agent (agent_arg_t *agent_arg_ptr) fatal ("pthread_create error %m"); } } + +/* slurmctld_free_job_launch_msg is a variant of slurm_free_job_launch_msg + * because all environment variables are currently loaded in one xmalloc + * buffer (see get_job_env()), which is different from how slurmd + * assembles the data from a message */ + +void slurmctld_free_job_launch_msg(batch_job_launch_msg_t * msg) +{ + if (msg) { + if (msg->environment) { + if (msg->environment[0]) + xfree(msg->environment[0]); + + xfree(msg->environment); + msg->environment = NULL; + } + slurm_free_job_launch_msg (msg); + } +} + diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 5881a30144e..4f84265587d 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -361,9 +361,11 @@ dump_job_details_state (struct job_details *detail_ptr, void **buf_ptr, int *buf pack32 ((uint32_t) detail_ptr->num_procs, buf_ptr, buf_len); pack32 ((uint32_t) detail_ptr->num_nodes, buf_ptr, buf_len); + pack16 ((uint16_t) detail_ptr->shared, buf_ptr, buf_len); pack16 ((uint16_t) detail_ptr->contiguous, buf_ptr, buf_len); pack16 ((uint16_t) detail_ptr->kill_on_node_fail, buf_ptr, buf_len); + pack16 ((uint16_t) detail_ptr->batch_flag, buf_ptr, buf_len); pack32 ((uint32_t) detail_ptr->min_procs, buf_ptr, buf_len); pack32 ((uint32_t) detail_ptr->min_memory, buf_ptr, buf_len); @@ -466,7 +468,7 @@ load_job_state ( void ) uint16_t job_state, next_step_id, details; char *nodes = NULL, *partition = NULL, *name = NULL; uint32_t num_procs, num_nodes, min_procs, min_memory, min_tmp_disk, submit_time; - uint16_t shared, contiguous, 
kill_on_node_fail, name_len; + uint16_t shared, contiguous, kill_on_node_fail, name_len, batch_flag; char *req_nodes = NULL, *features = NULL; char *stderr = NULL, *stdin = NULL, *stdout = NULL, *work_dir = NULL; slurm_job_credential_t *credential_ptr = NULL; @@ -536,9 +538,11 @@ load_job_state ( void ) safe_unpack32 (&num_procs, &buf_ptr, &buffer_size); safe_unpack32 (&num_nodes, &buf_ptr, &buffer_size); + safe_unpack16 (&shared, &buf_ptr, &buffer_size); safe_unpack16 (&contiguous, &buf_ptr, &buffer_size); safe_unpack16 (&kill_on_node_fail, &buf_ptr, &buffer_size); + safe_unpack16 (&batch_flag, &buf_ptr, &buffer_size); safe_unpack32 (&min_procs, &buf_ptr, &buffer_size); safe_unpack32 (&min_memory, &buf_ptr, &buffer_size); @@ -617,7 +621,7 @@ load_job_state ( void ) job_ptr->details->shared = shared; job_ptr->details->contiguous = contiguous; job_ptr->details->kill_on_node_fail = kill_on_node_fail; - job_ptr->details->kill_on_node_fail = 1; + job_ptr->details->batch_flag = batch_flag; job_ptr->details->min_procs = min_procs; job_ptr->details->min_memory = min_memory; job_ptr->details->min_tmp_disk = min_tmp_disk; @@ -1275,10 +1279,15 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate, goto cleanup ; } - if ( ( error_code = copy_job_desc_to_file ( job_desc , (*job_rec_ptr)->job_id ) ) ) { - error_code = ESLURM_WRITING_TO_FILE ; - goto cleanup ; + if (job_desc->script) { + if ( ( error_code = copy_job_desc_to_file ( job_desc , (*job_rec_ptr)->job_id ) ) ) { + error_code = ESLURM_WRITING_TO_FILE ; + goto cleanup ; + } + (*job_rec_ptr)->details->batch_flag = 1; } + else + (*job_rec_ptr)->details->batch_flag = 0; if (part_ptr->shared == SHARED_FORCE) /* shared=force */ (*job_rec_ptr)->details->shared = 1; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 29f6661194d..864e7a1d092 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -208,7 +208,7 @@ launch_job (struct job_record 
*job_ptr) pthread_attr_t attr_agent; pthread_t thread_agent; -/* if (job_ptr->details->batch_flag == 0) */ + if (job_ptr->details->batch_flag == 0) return; node_ptr = find_first_node_record (job_ptr -> node_bitmap); @@ -235,10 +235,9 @@ launch_job (struct job_record *job_ptr) agent_arg_ptr -> slurm_addr = xmalloc (sizeof (struct sockaddr_in)); memcpy (agent_arg_ptr -> slurm_addr, &(node_ptr -> slurm_addr), sizeof (struct sockaddr_in)); - agent_arg_ptr -> node_names = node_ptr -> name; + agent_arg_ptr -> node_names = xstrdup (node_ptr -> name); agent_arg_ptr -> msg_type = REQUEST_BATCH_JOB_LAUNCH; agent_arg_ptr -> msg_args = (void *)launch_msg_ptr; -/* FIXME: Agent must perform full data structure cleanup for launch_msg_ptr */ /* Launch the RPC via agent */ debug3 ("Spawning job launch agent for job_id %u", job_ptr -> job_id); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 0f321c41861..d7c73171897 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -171,6 +171,7 @@ extern int job_count; /* number of jobs in the system */ /* job_details - specification of a job's constraints */ struct job_details { uint32_t magic; /* magic cookie to test data integrity */ + uint16_t batch_flag; /* 1 if batch job (with script) */ uint32_t num_procs; /* minimum number of processors */ uint32_t num_nodes; /* minimum number of nodes */ char *req_nodes; /* required nodes */ -- GitLab