From 5d9fa3f6515d67d6ee9755f3a21f5aae97377358 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 3 Oct 2003 20:59:14 +0000 Subject: [PATCH] Resend KILL_JOB message if a completing job hangs. This can recover from the loss of an EPILOG_COMPLETE message. --- src/slurmctld/job_mgr.c | 12 +++-- src/slurmctld/node_scheduler.c | 93 +++++++++++++++++++++++++++++++--- src/slurmctld/slurmctld.h | 13 ++++- 3 files changed, 104 insertions(+), 14 deletions(-) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index e175a7cf47a..faf88e2374b 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2159,11 +2159,15 @@ static int _list_find_job_old(void *job_entry, void *key) if (job_ptr->end_time > min_age) return 0; /* Too new to purge */ - if ((!(IS_JOB_FINISHED(job_ptr))) || - (job_ptr->job_state & JOB_COMPLETING)) - return 0; /* Still active, can't purge */ + if (!(IS_JOB_FINISHED(job_ptr))) + return 0; /* Job still active */ - return 1; + if (job_ptr->job_state & JOB_COMPLETING) { + re_kill_job(job_ptr); + return 0; /* Job still completing */ + } + + return 1; /* Purge the job */ } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 820f4ec0b4e..c40e03437ca 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -159,12 +159,11 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) kill_job->job_uid = job_ptr->user_id; for (i = 0; i < node_record_count; i++) { + struct node_record *node_ptr = &node_record_table_ptr[i]; if (bit_test(job_ptr->node_bitmap, i) == 0) continue; - base_state = node_record_table_ptr[i].node_state & - (~NODE_STATE_NO_RESPOND); - no_resp_flag = node_record_table_ptr[i].node_state & - NODE_STATE_NO_RESPOND; + base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND); + no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND; if ((base_state == NODE_STATE_DOWN) || no_resp_flag) { /* Issue the KILL RPC, but don't verify response */ 
down_node_cnt++; @@ -180,12 +179,12 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) (MAX_NAME_LEN * buf_rec_size)); } agent_args->slurm_addr[agent_args->node_count] = - node_record_table_ptr[i].slurm_addr; + node_ptr->slurm_addr; strncpy(&agent_args-> node_names[MAX_NAME_LEN * agent_args->node_count], - node_record_table_ptr[i].name, MAX_NAME_LEN); + node_ptr->name, MAX_NAME_LEN); agent_args->node_count++; - make_node_comp(&node_record_table_ptr[i]); + make_node_comp(node_ptr); } if ((agent_args->node_count - down_node_cnt) == 0) @@ -193,6 +192,7 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) if (agent_args->node_count == 0) { error("Job %u allocated no nodes to be killed on", job_ptr->job_id); + xfree(kill_job); xfree(agent_args); return; } @@ -218,7 +218,6 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) return; } - /* * _match_feature - determine if the desired feature is one of those available * IN seek - desired feature @@ -1182,3 +1181,81 @@ static int _valid_features(char *requested, char *available) xfree(tmp_requested); return result; } + +/* + * re_kill_job - for a given job, deallocate its nodes for a second time, + * basically a cleanup for failed deallocate_nodes() calls + * IN job_ptr - pointer to terminating job (already in some COMPLETING state) + * globals: node_record_count - number of nodes in the system + * node_record_table_ptr - pointer to global node table + */ +extern void re_kill_job(struct job_record *job_ptr) +{ + int i, retries = 0; + kill_job_msg_t *kill_job; + agent_arg_t *agent_args; + pthread_attr_t attr_agent; + pthread_t thread_agent; + int buf_rec_size = 0; + + xassert(job_ptr); + xassert(job_ptr->details); + + agent_args = xmalloc(sizeof(agent_arg_t)); + agent_args->msg_type = REQUEST_KILL_JOB; + agent_args->retry = 0; + kill_job = xmalloc(sizeof(kill_job_msg_t)); + last_node_update = time(NULL); + kill_job->job_id = job_ptr->job_id; + kill_job->job_uid = job_ptr->user_id; + 
for (i = 0; i < node_record_count; i++) { + struct node_record *node_ptr = &node_record_table_ptr[i]; + if (bit_test(job_ptr->node_bitmap, i) == 0) + continue; + if (node_ptr->node_state & NODE_STATE_NO_RESPOND) + continue; + info("Resending KILL_JOB request for JobId=%u, Node=%s", + job_ptr->job_id, node_ptr->name); + if ((agent_args->node_count + 1) > buf_rec_size) { + buf_rec_size += 32; + xrealloc((agent_args->slurm_addr), + (sizeof(struct sockaddr_in) * + buf_rec_size)); + xrealloc((agent_args->node_names), + (MAX_NAME_LEN * buf_rec_size)); + } + agent_args->slurm_addr[agent_args->node_count] = + node_ptr->slurm_addr; + strncpy(&agent_args-> + node_names[MAX_NAME_LEN * agent_args->node_count], + node_ptr->name, MAX_NAME_LEN); + agent_args->node_count++; + } + + if (agent_args->node_count == 0) { + xfree(kill_job); + xfree(agent_args); + return; + } + + agent_args->msg_args = kill_job; + debug2("Spawning job kill agent"); + if (pthread_attr_init(&attr_agent)) + fatal("pthread_attr_init error %m"); + if (pthread_attr_setdetachstate + (&attr_agent, PTHREAD_CREATE_DETACHED)) + error("pthread_attr_setdetachstate error %m"); +#ifdef PTHREAD_SCOPE_SYSTEM + if (pthread_attr_setscope(&attr_agent, PTHREAD_SCOPE_SYSTEM)) + error("pthread_attr_setscope error %m"); +#endif + while (pthread_create(&thread_agent, &attr_agent, agent, + (void *) agent_args)) { + error("pthread_create error %m"); + if (++retries > MAX_RETRIES) + fatal("Can't create pthread"); + sleep(1); /* sleep and try again */ + } + return; +} diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 39f2024e0a4..62a6f14ee48 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -888,11 +888,20 @@ void part_fini (void); * global: job_list - global job table * last_job_update - time of last job table update */ -void purge_old_job (void); +extern void purge_old_job (void); + +/* + * re_kill_job - for a given job, deallocate its nodes for a second time, + * basically a cleanup 
for failed deallocate_nodes() calls + * IN job_ptr - pointer to terminating job (already in some COMPLETING state) + * globals: node_record_count - number of nodes in the system + * node_record_table_ptr - pointer to global node table + */ +extern void re_kill_job(struct job_record *job_ptr); /* rehash_jobs - Create or rebuild the job rehash table. Actually for now we * just preserve it */ -void rehash_jobs(void); +extern void rehash_jobs(void); /* * rehash_node - build a hash table of the node_record entries. this is a large -- GitLab