diff --git a/NEWS b/NEWS index fa4797d77b916eb0e7ca41464e5e89339633138f..5a74cb7960b2852614d139ac947e544e1745e7ce 100644 --- a/NEWS +++ b/NEWS @@ -37,6 +37,11 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - Correct method to update conn_type of a job. -- BLUEGENE - Fix issue with preemption when needing to preempt multiple jobs to make one job run. + -- Fixed issue where if an srun dies inside of an allocation abnormally it + would have also killed the allocation. + -- FRONTEND - fixed issue where if a system's nodes weren't defined in the + slurm.conf with NodeAddr's, signals going to a step could be handled + incorrectly. * Changes in SLURM 2.5.0 ======================== diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index 09a8edebb65f46f50b707372a4c3fd2e1b60ed8f..7d6a5ae508b2424702fe8dfb3bac17a4b52f6eed 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -592,7 +592,7 @@ extern void create_srun_job(srun_job_t **p_job, bool *got_alloc, * Spawn process to insure clean-up of job and/or step * on abnormal termination */ - shepard_fd = _shepard_spawn(job, got_alloc); + shepard_fd = _shepard_spawn(job, *got_alloc); } *p_job = job; @@ -1305,7 +1305,8 @@ static int _shepard_spawn(srun_job_t *job, bool got_alloc) } } - (void) slurm_terminate_job_step(job->jobid, job->stepid); + (void) slurm_kill_job_step(job->jobid, job->stepid, SIGKILL); + if (got_alloc) slurm_complete_job(job->jobid, NO_VAL); exit(0); diff --git a/src/sview/job_info.c b/src/sview/job_info.c index b90554da8421a4e34597853c1b476d8d2017855f..aa8273552af845fdfb4cad9c8ca1451b3e58dc56 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -597,13 +597,8 @@ static int _cancel_step_id(uint32_t job_id, uint32_t step_id, for (i = 0; i < MAX_CANCEL_RETRY; i++) { /* NOTE: RPC always sent to slurmctld rather than directly * to slurmd daemons */ - if (signal == SIGKILL) { - error_code = slurm_terminate_job_step(job_id, step_id); 
+ error_code = slurm_kill_job_step(job_id, step_id, signal); - } else { - error_code = slurm_kill_job_step(job_id, step_id, - signal); - } if (error_code == 0 || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE && errno != ESLURM_JOB_PENDING))