From f915ef34783777e3604ca6433c77394dba17a0cc Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 9 Nov 2006 20:01:24 +0000 Subject: [PATCH] When a job epilog completes on all non-DOWN nodes, immediately purge its job steps that lack switch windows. Needed for LSF operation. Based upon slurm.hp.node_fail.patch. --- NEWS | 3 +++ src/slurmctld/job_mgr.c | 5 +++-- src/slurmctld/node_scheduler.c | 1 + src/slurmctld/slurmctld.h | 9 ++++++--- src/slurmctld/step_mgr.c | 14 ++++++++++---- 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index d411524a37e..1ce084fc928 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,9 @@ documents those changes that are of interest to users and admins. -- Race condition for forwarding logic fix from Hongjia Cao -- Add support for Portable Linux Processor Affinity (PLPA, see http://www.open-mpi.org/software/plpa). + -- When a job epilog completes on all non-DOWN nodes, immediately purge + its job steps that lack switch windows. Needed for LSF operation. + Based upon slurm.hp.node_fail.patch. -- Modify srun to ignore entries on --nodelist for job step creation if their count exceeds the task count. Based on slurm.hp.srun.patch. 
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 46049981b43..835e0853f41 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -999,7 +999,8 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) error("node_cnt underflow on JobId=%u", job_ptr->job_id); if (job_ptr->node_cnt == 0) { - job_ptr->job_state &= (~JOB_COMPLETING); + job_ptr->job_state &= (~JOB_COMPLETING);\ + delete_step_records(job_ptr, 1); slurm_sched_schedule(); } if (node_ptr->comp_job_cnt) @@ -2659,7 +2660,7 @@ static void _list_delete_job(void *job_entry) xfree(job_ptr->comment); select_g_free_jobinfo(&job_ptr->select_jobinfo); if (job_ptr->step_list) { - delete_all_step_records(job_ptr); + delete_step_records(job_ptr, 0); list_destroy(job_ptr->step_list); } job_count--; diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 045c0dcdcc7..28972b6de66 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -216,6 +216,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if ((agent_args->node_count - down_node_cnt) == 0) { job_ptr->job_state &= (~JOB_COMPLETING); + delete_step_records(job_ptr, 1); slurm_sched_schedule(); } if (agent_args->node_count == 0) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index bb03003b7de..eeac384e02e 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -496,10 +496,13 @@ extern struct part_record *create_part_record (void); extern struct step_record * create_step_record (struct job_record *job_ptr); /* - * delete_all_step_records - delete all step record for specified job_ptr - * IN job_ptr - pointer to job table entry to have step record added + * delete_step_records - delete step record for specified job_ptr + * IN job_ptr - pointer to job table entry to have step records removed + * IN filter - determine which job steps to delete + * 0: delete all job steps + * 1: delete only job steps 
without a switch allocation */ -extern void delete_all_step_records (struct job_record *job_ptr); +extern void delete_step_records (struct job_record *job_ptr, int filter); /* * delete_job_details - delete a job's detail record and clear it's pointer diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 1e428108101..154b6e14f18 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -104,11 +104,14 @@ create_step_record (struct job_record *job_ptr) /* - * delete_all_step_records - delete all step record for specified job_ptr - * IN job_ptr - pointer to job table entry to have step record added + * delete_step_records - delete step record for specified job_ptr + * IN job_ptr - pointer to job table entry to have step records removed + * IN filter - determine which job steps to delete + * 0: delete all job steps + * 1: delete only job steps without a switch allocation */ -void -delete_all_step_records (struct job_record *job_ptr) +extern void +delete_step_records (struct job_record *job_ptr, int filter) { ListIterator step_iterator; struct step_record *step_ptr; @@ -118,6 +121,9 @@ delete_all_step_records (struct job_record *job_ptr) last_job_update = time(NULL); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { + if ((filter == 1) && (step_ptr->switch_job)) + continue; + list_remove (step_iterator); if (step_ptr->switch_job) { switch_g_job_step_complete( -- GitLab