diff --git a/NEWS b/NEWS index d411524a37eb9503aa4f7ee3a2ced824ab5a07ff..1ce084fc928168c42b9440ace88e0a6b79ceb0a7 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,9 @@ documents those changes that are of interest to users and admins. -- Race condition for forwarding logic fix from Hongjia Cao -- Add support for Portable Linux Processor Affinity (PLPA, see http://www.open-mpi.org/software/plpa). + -- When a job epilog completes on all non-DOWN nodes, immediately purge + its job steps that lack switch windows. Needed for LSF operation. + Based upon slurm.hp.node_fail.patch. -- Modify srun to ignore entries on --nodelist for job step creation if their count exceeds the task count. Based on slurm.hp.srun.patch. diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 46049981b436ac96c3abf239f5ce044bc2a4e65b..835e0853f41e608c0a6bd9953bf6764d81de2683 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -999,7 +999,8 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) error("node_cnt underflow on JobId=%u", job_ptr->job_id); if (job_ptr->node_cnt == 0) { - job_ptr->job_state &= (~JOB_COMPLETING); + job_ptr->job_state &= (~JOB_COMPLETING);\ + delete_step_records(job_ptr, 1); slurm_sched_schedule(); } if (node_ptr->comp_job_cnt) @@ -2659,7 +2660,7 @@ static void _list_delete_job(void *job_entry) xfree(job_ptr->comment); select_g_free_jobinfo(&job_ptr->select_jobinfo); if (job_ptr->step_list) { - delete_all_step_records(job_ptr); + delete_step_records(job_ptr, 0); list_destroy(job_ptr->step_list); } job_count--; diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 045c0dcdcc79f2d97b041c5bd9cc1b6c89ab875b..28972b6de66f407d9c6b87a8e582ff416eac22d0 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -216,6 +216,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if ((agent_args->node_count - down_node_cnt) == 0) { job_ptr->job_state &= 
(~JOB_COMPLETING); + delete_step_records(job_ptr, 1); slurm_sched_schedule(); } if (agent_args->node_count == 0) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index bb03003b7de7d7aa804ee94e874f596e69b26551..eeac384e02e02fe540a01685a381d79cec36b6a1 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -496,10 +496,13 @@ extern struct part_record *create_part_record (void); extern struct step_record * create_step_record (struct job_record *job_ptr); /* - * delete_all_step_records - delete all step record for specified job_ptr - * IN job_ptr - pointer to job table entry to have step record added + * delete_step_records - delete step records for specified job_ptr + * IN job_ptr - pointer to job table entry to have step records removed + * IN filter - determine which job steps to delete + * 0: delete all job steps + * 1: delete only job steps without a switch allocation */ -extern void delete_all_step_records (struct job_record *job_ptr); +extern void delete_step_records (struct job_record *job_ptr, int filter); /* * delete_job_details - delete a job's detail record and clear it's pointer diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 1e428108101708bca1e3a924a407b06d6f49cc31..154b6e14f187ba2787b4fca9061cdc3240fe8371 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -104,11 +104,14 @@ create_step_record (struct job_record *job_ptr) /* - * delete_all_step_records - delete all step record for specified job_ptr - * IN job_ptr - pointer to job table entry to have step record added + * delete_step_records - delete step records for specified job_ptr + * IN job_ptr - pointer to job table entry to have step records removed + * IN filter - determine which job steps to delete + * 0: delete all job steps + * 1: delete only job steps without a switch allocation */ -void -delete_all_step_records (struct job_record *job_ptr) +extern void +delete_step_records (struct job_record *job_ptr, int filter) 
{ ListIterator step_iterator; struct step_record *step_ptr; @@ -118,6 +121,9 @@ delete_all_step_records (struct job_record *job_ptr) last_job_update = time(NULL); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { + if ((filter == 1) && (step_ptr->switch_job)) + continue; + list_remove (step_iterator); if (step_ptr->switch_job) { switch_g_job_step_complete(