diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 5a87e746ebea312d328abcbf1762560774726231..d601ffb95c92c4a99961cb9145ac8f0eea4563c4 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -940,7 +940,8 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg) /* do RPC call */ if (job_step_kill_msg->job_step_id == NO_VAL) { - error_code = job_cancel(job_step_kill_msg->job_id, uid); + error_code = job_signal(job_step_kill_msg->job_id, + job_step_kill_msg->signal, uid); unlock_slurmctld(job_write_lock); /* return result */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index b105322aa103f0cce93416841510f57c2eaa65e5..a9635115b27f3a252c21b541bb752eafa42f84ca 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1101,20 +1101,21 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id, /* - * job_cancel - cancel the specified job - * IN job_id - id of the job to be cancelled + * job_signal - signal the specified job + * IN job_id - id of the job to be signaled + * IN signal - signal to send, SIGKILL == cancel the job * IN uid - uid of requesting user * RET 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ -int job_cancel(uint32_t job_id, uid_t uid) +int job_signal(uint32_t job_id, uint16_t signal, uid_t uid) { struct job_record *job_ptr; job_ptr = find_job_record(job_id); if (job_ptr == NULL) { - info("job_cancel: invalid job id %u", job_id); + info("job_signal: invalid job id %u", job_id); return ESLURM_INVALID_JOB_ID; } @@ -1129,28 +1130,47 @@ int job_cancel(uint32_t job_id, uid_t uid) return ESLURM_USER_ID_MISSING; } - if (job_ptr->job_state == JOB_PENDING) { + if ((job_ptr->job_state == JOB_PENDING) && + (signal == SIGKILL)) { last_job_update = time(NULL); job_ptr->job_state = JOB_FAILED; job_ptr->start_time = job_ptr->end_time = time(NULL); delete_job_details(job_ptr); - verbose("job_cancel of pending job %u successful", job_id); + verbose("job_signal of pending job %u successful", job_id); return SLURM_SUCCESS; } if (job_ptr->job_state == JOB_RUNNING) { - last_job_update = time(NULL); - job_ptr->job_state = JOB_FAILED; - job_ptr->end_time = time(NULL); - deallocate_nodes(job_ptr); - delete_all_step_records(job_ptr); - delete_job_details(job_ptr); - verbose("job_cancel of running job %u successful", job_id); + ListIterator step_record_iterator; + struct step_record *step_ptr; + int step_cnt = 0; + + step_record_iterator = + list_iterator_create (job_ptr->step_list); + while ((step_ptr = (struct step_record *) + list_next (step_record_iterator))) { + signal_step_tasks(step_ptr, signal); + step_cnt++; + } + list_iterator_destroy (step_record_iterator); + + if (signal == SIGKILL) { + job_ptr->kill_on_step_done = 1; + last_job_update = time(NULL); + } + if ((signal == SIGKILL) && (step_cnt == 0)) { + /* kill job with no active steps */ + job_ptr->job_state = JOB_COMPLETE; + job_ptr->end_time = time(NULL); + deallocate_nodes(job_ptr); + delete_job_details(job_ptr); + } + verbose("job_signal of running job %u successful", job_id); return SLURM_SUCCESS; } - verbose("job_cancel: job %u can't be cancelled from state=%s", - job_id, job_state_string(job_ptr->job_state)); + verbose("job_signal: job %u can't be sent signal %u from state=%s", + job_id, signal, job_state_string(job_ptr->job_state)); return ESLURM_TRANSITION_STATE_NO_UPDATE; } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 3e190b2af36658384baca295c812bb017c88322b..27b78407c64722498e3f7e77e551c3b9d263fc2d 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -580,14 +580,15 @@ extern int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id, uint16_t * node_cnt, slurm_addr ** node_addr); /* - * job_cancel - cancel the specified job - * IN job_id - id of the job to be cancelled + * job_signal - signal the specified job + * IN job_id - id of the job to be signaled + * IN signal - signal to send, SIGKILL == cancel the job * IN uid - uid of requesting user * RET 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ -extern int job_cancel (uint32_t job_id, uid_t uid); +extern int job_signal(uint32_t job_id, uint16_t signal, uid_t uid); /* * job_step_cancel - cancel the specified job step @@ -886,6 +887,13 @@ extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid); * Uses common data structures. */ extern void set_slurmd_addr (void); +/* + * signal_step_tasks - send specific signal to specific job step + * IN step_ptr - step record pointer + * IN signal - signal to send + */ +extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal); + /* * step_create - creates a step_record in step_specs->job_id, sets up the * accoding to the step_specs. diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 6d94444814d3c961e12861a94ed6cf99c7f00b71..c722f26f5cb2d264b3cc45612a0bae0a7dca25b5 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -51,8 +51,6 @@ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer); static bitstr_t * _pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ); -static void _signal_step_tasks(struct step_record *step_ptr, uint16_t signal); - /* * create_step_record - create an empty step_record for the specified job. * IN job_ptr - pointer to job table entry to have step record added @@ -235,12 +233,17 @@ int job_step_signal(uint32_t job_id, uint32_t step_id, return ESLURM_ALREADY_DONE; } - _signal_step_tasks(step_ptr, signal); + signal_step_tasks(step_ptr, signal); return SLURM_SUCCESS; } -static void _signal_step_tasks(struct step_record *step_ptr, uint16_t signal) +/* + * signal_step_tasks - send specific signal to specific job step + * IN step_ptr - step record pointer + * IN signal - signal to send + */ +void signal_step_tasks(struct step_record *step_ptr, uint16_t signal) { int i; kill_tasks_msg_t *kill_tasks_msg; @@ -329,7 +332,8 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, return ESLURM_INVALID_JOB_ID; } - if (job_ptr->kill_on_step_done) + if ((job_ptr->kill_on_step_done) && + (list_count(job_ptr->step_list) <= 1)) return job_complete(job_id, uid, requeue, job_return_code); if ((job_ptr->job_state == JOB_FAILED) || @@ -517,8 +521,12 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, return ESLURM_BAD_DIST; #endif - job_ptr->time_last_active = time(NULL); + if (job_ptr->kill_on_step_done) + /* Don't start more steps, job already being cancelled */ + return ESLURM_ALREADY_DONE; job_ptr->kill_on_step_done = kill_job_when_step_done; + + job_ptr->time_last_active = time(NULL); nodeset = _pick_step_nodes (job_ptr, step_specs); if (nodeset == NULL) return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;