diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index 95c08d98b57aa4942355ddf517611ba1c362a480..b9f6dfbe3c696bf461975b49af5bb425b0bd06ac 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -2084,7 +2084,8 @@ extern int select_p_step_finish(struct step_record *step_ptr) xassert(step_ptr); - if (IS_JOB_COMPLETING(step_ptr->job_ptr)) { + if (IS_JOB_COMPLETING(step_ptr->job_ptr) || + IS_JOB_FINISHED(step_ptr->job_ptr)) { debug("step completion %u.%u was received after job " "allocation is already completing, no cleanup needed", step_ptr->job_ptr->job_id, step_ptr->step_id); diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 0117fefb7c88f14820788e5e7dd599bc68fc3021..c12f9a2978fb4d4ec375e97e4ad29d1cfefe3018 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -1839,7 +1839,8 @@ extern int select_p_step_finish(struct step_record *step_ptr) /* The NHC needs to be ran after each step even if the job is about to * run the NHC for the allocation. The NHC developers feel this is * needed. If it ever changes just use this below code. */ - else if (IS_JOB_COMPLETING(step_ptr->job_ptr)) { + else if (IS_JOB_COMPLETING(step_ptr->job_ptr) || + IS_JOB_FINISHED(step_ptr->job_ptr)) { debug3("step completion %u.%u was received after job " "allocation is already completing, no extra NHC needed.", step_ptr->job_ptr->job_id, step_ptr->step_id); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index e97bd6c57abe41153280d8965e5c3198ce001625..f562bd630630b5eeef545d0094345657d971a86a 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2705,8 +2705,8 @@ extern int kill_job_by_front_end_name(char *node_name) job_ptr->job_id); } if (job_ptr->node_cnt == 0) { - job_ptr->job_state &= (~JOB_COMPLETING); delete_step_records(job_ptr); + job_ptr->job_state &= (~JOB_COMPLETING); slurm_sched_g_schedule(); } node_ptr = &node_record_table_ptr[i]; @@ -2935,8 +2935,8 @@ extern int kill_running_job_by_node_name(char *node_name) job_ptr->job_id); } if (job_ptr->node_cnt == 0) { - job_ptr->job_state &= (~JOB_COMPLETING); delete_step_records(job_ptr); + job_ptr->job_state &= (~JOB_COMPLETING); slurm_sched_g_schedule(); } if (node_ptr->comp_job_cnt) @@ -6893,7 +6893,7 @@ static void _list_delete_job(void *job_entry) xfree(job_ptr->resp_host); xfree(job_ptr->resv_name); xfree(job_ptr->sched_nodes); - for (i=0; i<job_ptr->spank_job_env_size; i++) + for (i = 0; i < job_ptr->spank_job_env_size; i++) xfree(job_ptr->spank_job_env[i]); xfree(job_ptr->spank_job_env); xfree(job_ptr->state_desc); @@ -6901,8 +6901,8 @@ static void _list_delete_job(void *job_entry) delete_step_records(job_ptr); list_destroy(job_ptr->step_list); } - /* select_jobinfo is used in delete_step_records so free it - afterwards */ + /* select_jobinfo is used by delete_step_records(), so free it + * afterwards */ select_g_select_jobinfo_free(job_ptr->select_jobinfo); xfree(job_ptr->wckey); job_count--; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index ec2e7df5a59067969db39b5db4dfb73184a114c6..1983b7485eab5a8fb4cb10dcc1e072ba9ae736de 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -3279,9 +3279,9 @@ cleanup_completing(struct job_record *job_ptr) __func__, job_ptr->job_id,(long) delay); } + delete_step_records(job_ptr); job_ptr->job_state &= (~JOB_COMPLETING); job_hold_requeue(job_ptr); - delete_step_records(job_ptr); slurm_sched_g_schedule(); } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 0026e4d1540561e0519e44113bcc0f28865e1a14..028a021aa5eca453440270db219e3a65b2cac983 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -597,8 +597,8 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, #endif if ((agent_args->node_count - down_node_cnt) == 0) { - job_ptr->job_state &= (~JOB_COMPLETING); delete_step_records(job_ptr); + job_ptr->job_state &= (~JOB_COMPLETING); slurm_sched_g_schedule(); } @@ -2831,8 +2831,8 @@ extern void re_kill_job(struct job_record *job_ptr) if ((job_ptr->node_cnt > 0) && ((--job_ptr->node_cnt) == 0)) { last_node_update = time(NULL); - job_ptr->job_state &= (~JOB_COMPLETING); delete_step_records(job_ptr); + job_ptr->job_state &= (~JOB_COMPLETING); slurm_sched_g_schedule(); batch_requeue_fini(job_ptr); last_node_update = time(NULL); @@ -2860,8 +2860,8 @@ extern void re_kill_job(struct job_record *job_ptr) (node_ptr->comp_job_cnt)--; if ((job_ptr->node_cnt > 0) && ((--job_ptr->node_cnt) == 0)) { - job_ptr->job_state &= (~JOB_COMPLETING); delete_step_records(job_ptr); + job_ptr->job_state &= (~JOB_COMPLETING); slurm_sched_g_schedule(); batch_requeue_fini(job_ptr); last_node_update = time(NULL);