From 25227d02af5919c7f90ee626ceff2c70d2fd013d Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 20 Dec 2004 17:28:45 +0000 Subject: [PATCH] Fix how job kill RPC is handled, include node_select data if available, don't seg fault if data is missing, free the data when the agent is finished with it. --- src/common/node_select.c | 18 ++++++++++-------- src/common/slurm_protocol_defs.h | 3 ++- src/slurmctld/agent.c | 3 +++ src/slurmctld/job_mgr.c | 19 +++++++++++++------ src/slurmctld/node_mgr.c | 8 ++++---- src/slurmctld/slurmctld.h | 2 ++ 6 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/common/node_select.c b/src/common/node_select.c index 24380651d7e..92267a6c65f 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -10,7 +10,7 @@ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Morris Jette <jette@llnl.gov>. + * Written by Morris Jette <jette1@llnl.gov>. * UCRL-CODE-2002-040. * * This file is part of SLURM, a resource management program. @@ -409,7 +409,7 @@ extern int select_g_set_jobinfo (select_jobinfo_t jobinfo, char * tmp_char = (char *) data; if (jobinfo->magic != JOBINFO_MAGIC) { - error("select_p_set_jobinfo: jobinfo magic bad"); + error("select_g_set_jobinfo: jobinfo magic bad"); return SLURM_ERROR; } @@ -452,7 +452,7 @@ extern int select_g_get_jobinfo (select_jobinfo_t jobinfo, char **tmp_char = (char **) data; if (jobinfo->magic != JOBINFO_MAGIC) { - error("select_p_set_jobinfo: jobinfo magic bad"); + error("select_g_get_jobinfo: jobinfo magic bad"); return SLURM_ERROR; } @@ -494,8 +494,10 @@ extern select_jobinfo_t select_g_copy_jobinfo(select_jobinfo_t jobinfo) { struct select_jobinfo *rc = NULL; - if (jobinfo->magic != JOBINFO_MAGIC) - error("select_p_copy_jobinfo: jobinfo magic bad"); + if (jobinfo == NULL) + ; + else if (jobinfo->magic != JOBINFO_MAGIC) + error("select_g_copy_jobinfo: jobinfo magic bad"); else { int i; rc = xmalloc(sizeof(struct select_jobinfo)); @@ -523,7 +525,7 @@ extern int select_g_free_jobinfo (select_jobinfo_t *jobinfo) if (*jobinfo == NULL) /* never set, treat as not an error */ ; else if ((*jobinfo)->magic != JOBINFO_MAGIC) { - error("select_p_set_jobinfo: jobinfo magic bad"); + error("select_g_free_jobinfo: jobinfo magic bad"); rc = EINVAL; } else { (*jobinfo)->magic = 0; @@ -595,13 +597,13 @@ extern char *select_g_sprint_jobinfo(select_jobinfo_t jobinfo, int i; if (buf == NULL) { - error("select_p_sprint_jobinfo: buf is null"); + error("select_g_sprint_jobinfo: buf is null"); return NULL; } if ((mode != SELECT_PRINT_DATA) && jobinfo && (jobinfo->magic != JOBINFO_MAGIC)) { - error("select_p_copy_jobinfo: jobinfo magic bad"); + error("select_g_sprint_jobinfo: jobinfo magic bad"); return NULL; } diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 480ceb211cd..7cae18d1a95 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -319,7 +319,8 @@ typedef struct return_code_msg { /* Note: We include select_jobinfo here in addition to the job launch * RPC in order to insure reliable clean-up of a BlueGene partition in * the event of some launch failure or race condition preventing slurmd - * from getting the BGL_PARTITION_ID at that time */ + * from getting the BGL_PARTITION_ID at that time. It is needed for + * the job epilog. */ typedef struct kill_job_msg { uint32_t job_id; uint32_t job_uid; diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 7e34c9605e8..9de112fe525 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -935,6 +935,9 @@ static void _purge_agent_args(agent_arg_t *agent_arg_ptr) RESPONSE_RESOURCE_ALLOCATION) slurm_free_resource_allocation_response_msg( agent_arg_ptr->msg_args); + else if (agent_arg_ptr->msg_type == + REQUEST_KILL_JOB) + slurm_free_kill_job_msg(agent_arg_ptr->msg_args); else xfree(agent_arg_ptr->msg_args); } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 8c6a74885ce..1ca40dd9573 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3177,7 +3177,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, if (job_ptr == NULL) { error("Orphan job %u.%u reported on node %s", job_id_ptr[i], step_id_ptr[i], node_name); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); } else if (job_ptr->job_state == JOB_RUNNING) { @@ -3196,14 +3196,15 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, error ("Registered job %u.%u on wrong node %s ", job_id_ptr[i], step_id_ptr[i], node_name); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, + node_ptr); } } else if (job_ptr->job_state & JOB_COMPLETING) { /* Re-send kill request as needed, * not necessarily an error */ - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); } @@ -3215,7 +3216,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, last_job_update = now; job_ptr->start_time = job_ptr->end_time = now; delete_job_details(job_ptr); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); job_completion_logger(job_ptr); } @@ -3225,7 +3226,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, job_id_ptr[i], step_id_ptr[i], job_state_string(job_ptr->job_state), node_name); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); } } @@ -3277,10 +3278,12 @@ static void _purge_lost_batch_jobs(int node_inx, time_t now) * these jobs and use this function to kill them - one * agent request per node as they register. * IN job_id - id of the job to be killed + * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) * IN node_ptr - pointer to the node on which the job resides */ extern void -kill_job_on_node(uint32_t job_id, struct node_record *node_ptr) +kill_job_on_node(uint32_t job_id, struct job_record *job_ptr, + struct node_record *node_ptr) { agent_arg_t *agent_info; kill_job_msg_t *kill_req; @@ -3289,6 +3292,10 @@ kill_job_on_node(uint32_t job_id, struct node_record *node_ptr) kill_req = xmalloc(sizeof(kill_job_msg_t)); kill_req->job_id = job_id; + if (job_ptr) { /* NULL if unknown */ + kill_req->select_jobinfo = select_g_copy_jobinfo( + job_ptr->select_jobinfo); + } agent_info = xmalloc(sizeof(agent_arg_t)); agent_info->node_count = 1; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 04a31d29bdc..30e54b5b325 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1182,7 +1182,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count, if (job_ptr == NULL) { error("Orphan job %u.%u reported", job_id_ptr[i], step_id_ptr[i]); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); } else if (job_ptr->job_state == JOB_RUNNING) { @@ -1197,7 +1197,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count, else if (job_ptr->job_state & JOB_COMPLETING) { /* Re-send kill request as needed, * not necessarily an error */ - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); } @@ -1209,7 +1209,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count, last_job_update = now; job_ptr->start_time = job_ptr->end_time = now; delete_job_details(job_ptr); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); job_completion_logger(job_ptr); } @@ -1217,7 +1217,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count, error("Registered job %u.%u in state %s", job_id_ptr[i], step_id_ptr[i], job_state_string(job_ptr->job_state)); - kill_job_on_node(job_id_ptr[i], node_ptr); + kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr); } } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 9a23b1082c8..e4b3f0483b1 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -736,9 +736,11 @@ extern int kill_job_by_part_name(char *part_name); * these jobs and use this function to kill them - one * agent request per node as they register. * IN job_id - id of the job to be killed + * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) * IN node_ptr - pointer to the node on which the job resides */ extern void kill_job_on_node(uint32_t job_id, + struct job_record *job_ptr, struct node_record *node_ptr); /* -- GitLab