From 7734d9c0a057420c73e5ee8097c3c2b04ee9e0ea Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 11 Aug 2009 20:54:30 +0000 Subject: [PATCH] Avoid resuming a job in the gang scheduler that was not suspended by the gang scheduler (e.g. one suspended by admin action). Add some more logging. --- src/slurmctld/gang.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index 58c3e411cb4..ea4d2ad8b39 100644 --- a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -1430,6 +1430,8 @@ extern int gs_reconfig(void) for (i = 0; i < p_ptr->num_jobs; i++) { if (p_ptr->job_list[i]->sig_state == GS_SUSPEND) { + info("resuming job in missing part %s", + p_ptr->part_name); _resume_job(p_ptr->job_list[i]-> job_id); p_ptr->job_list[i]->sig_state = @@ -1447,8 +1449,8 @@ extern int gs_reconfig(void) * necessary). NOTE: there could be jobs that only overlap * on nodes that are no longer in the partition, but we're * not going to worry about those cases. - */ - /* add the jobs from p_ptr into new_ptr in their current order + * + * add the jobs from p_ptr into new_ptr in their current order * to preserve the state of timeslicing. */ for (i = 0; i < p_ptr->num_jobs; i++) { @@ -1457,9 +1459,13 @@ extern int gs_reconfig(void) /* job no longer exists in SLURM, so drop it */ continue; } - /* resume any job that is suspended */ - if (IS_JOB_SUSPENDED(job_ptr)) + /* resume any job that is suspended by us */ + if (IS_JOB_SUSPENDED(job_ptr) && + (job_ptr->priority != 0)) { + debug3("resuming job %u apparently suspended " + " by gang", job_ptr->job_id); _resume_job(job_ptr->job_id); + } /* transfer the job as long as it is still active */ if (IS_JOB_SUSPENDED(job_ptr) || -- GitLab