Skip to content
Snippets Groups Projects
Commit e6d85f72 authored by Moe Jette's avatar Moe Jette
Browse files

Job signal functions properly. All steps are signalled; if SIGKILL, the

allocation is released only upon termination of the last step.
parent bc29ffca
No related branches found
No related tags found
No related merge requests found
...@@ -940,7 +940,8 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg) ...@@ -940,7 +940,8 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg)
/* do RPC call */ /* do RPC call */
if (job_step_kill_msg->job_step_id == NO_VAL) { if (job_step_kill_msg->job_step_id == NO_VAL) {
error_code = job_cancel(job_step_kill_msg->job_id, uid); error_code = job_signal(job_step_kill_msg->job_id,
job_step_kill_msg->signal, uid);
unlock_slurmctld(job_write_lock); unlock_slurmctld(job_write_lock);
/* return result */ /* return result */
......
...@@ -1101,20 +1101,21 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id, ...@@ -1101,20 +1101,21 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
/* /*
* job_cancel - cancel the specified job * job_signal - signal the specified job
* IN job_id - id of the job to be cancelled * IN job_id - id of the job to be signaled
* IN signal - signal to send, SIGKILL == cancel the job
* IN uid - uid of requesting user * IN uid - uid of requesting user
* RET 0 on success, otherwise ESLURM error code * RET 0 on success, otherwise ESLURM error code
* global: job_list - pointer global job list * global: job_list - pointer global job list
* last_job_update - time of last job table update * last_job_update - time of last job table update
*/ */
int job_cancel(uint32_t job_id, uid_t uid) int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
{ {
struct job_record *job_ptr; struct job_record *job_ptr;
job_ptr = find_job_record(job_id); job_ptr = find_job_record(job_id);
if (job_ptr == NULL) { if (job_ptr == NULL) {
info("job_cancel: invalid job id %u", job_id); info("job_signal: invalid job id %u", job_id);
return ESLURM_INVALID_JOB_ID; return ESLURM_INVALID_JOB_ID;
} }
...@@ -1129,28 +1130,47 @@ int job_cancel(uint32_t job_id, uid_t uid) ...@@ -1129,28 +1130,47 @@ int job_cancel(uint32_t job_id, uid_t uid)
return ESLURM_USER_ID_MISSING; return ESLURM_USER_ID_MISSING;
} }
if (job_ptr->job_state == JOB_PENDING) { if ((job_ptr->job_state == JOB_PENDING) &&
(signal == SIGKILL)) {
last_job_update = time(NULL); last_job_update = time(NULL);
job_ptr->job_state = JOB_FAILED; job_ptr->job_state = JOB_FAILED;
job_ptr->start_time = job_ptr->end_time = time(NULL); job_ptr->start_time = job_ptr->end_time = time(NULL);
delete_job_details(job_ptr); delete_job_details(job_ptr);
verbose("job_cancel of pending job %u successful", job_id); verbose("job_signal of pending job %u successful", job_id);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
if (job_ptr->job_state == JOB_RUNNING) { if (job_ptr->job_state == JOB_RUNNING) {
last_job_update = time(NULL); ListIterator step_record_iterator;
job_ptr->job_state = JOB_FAILED; struct step_record *step_ptr;
job_ptr->end_time = time(NULL); int step_cnt = 0;
deallocate_nodes(job_ptr);
delete_all_step_records(job_ptr); step_record_iterator =
delete_job_details(job_ptr); list_iterator_create (job_ptr->step_list);
verbose("job_cancel of running job %u successful", job_id); while ((step_ptr = (struct step_record *)
list_next (step_record_iterator))) {
signal_step_tasks(step_ptr, signal);
step_cnt++;
}
list_iterator_destroy (step_record_iterator);
if (signal == SIGKILL) {
job_ptr->kill_on_step_done = 1;
last_job_update = time(NULL);
}
if ((signal == SIGKILL) && (step_cnt == 0)) {
/* kill job with no active steps */
job_ptr->job_state = JOB_COMPLETE;
job_ptr->end_time = time(NULL);
deallocate_nodes(job_ptr);
delete_job_details(job_ptr);
}
verbose("job_signal of running job %u successful", job_id);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
verbose("job_cancel: job %u can't be cancelled from state=%s", verbose("job_signal: job %u can't be sent signal %u from state=%s",
job_id, job_state_string(job_ptr->job_state)); job_id, signal, job_state_string(job_ptr->job_state));
return ESLURM_TRANSITION_STATE_NO_UPDATE; return ESLURM_TRANSITION_STATE_NO_UPDATE;
} }
......
...@@ -580,14 +580,15 @@ extern int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id, ...@@ -580,14 +580,15 @@ extern int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
uint16_t * node_cnt, slurm_addr ** node_addr); uint16_t * node_cnt, slurm_addr ** node_addr);
/* /*
* job_cancel - cancel the specified job * job_signal - signal the specified job
* IN job_id - id of the job to be cancelled * IN job_id - id of the job to be signaled
* IN signal - signal to send, SIGKILL == cancel the job
* IN uid - uid of requesting user * IN uid - uid of requesting user
* RET 0 on success, otherwise ESLURM error code * RET 0 on success, otherwise ESLURM error code
* global: job_list - pointer global job list * global: job_list - pointer global job list
* last_job_update - time of last job table update * last_job_update - time of last job table update
*/ */
extern int job_cancel (uint32_t job_id, uid_t uid); extern int job_signal(uint32_t job_id, uint16_t signal, uid_t uid);
/* /*
* job_step_cancel - cancel the specified job step * job_step_cancel - cancel the specified job step
...@@ -886,6 +887,13 @@ extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid); ...@@ -886,6 +887,13 @@ extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid);
* Uses common data structures. */ * Uses common data structures. */
extern void set_slurmd_addr (void); extern void set_slurmd_addr (void);
/*
* signal_step_tasks - send specific signal to specific job step
* IN step_ptr - step record pointer
* IN signal - signal to send
*/
extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
/* /*
* step_create - creates a step_record in step_specs->job_id, sets up the * step_create - creates a step_record in step_specs->job_id, sets up the
* according to the step_specs. * according to the step_specs.
......
...@@ -51,8 +51,6 @@ ...@@ -51,8 +51,6 @@
static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer); static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer);
static bitstr_t * _pick_step_nodes (struct job_record *job_ptr, static bitstr_t * _pick_step_nodes (struct job_record *job_ptr,
step_specs *step_spec ); step_specs *step_spec );
static void _signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
/* /*
* create_step_record - create an empty step_record for the specified job. * create_step_record - create an empty step_record for the specified job.
* IN job_ptr - pointer to job table entry to have step record added * IN job_ptr - pointer to job table entry to have step record added
...@@ -235,12 +233,17 @@ int job_step_signal(uint32_t job_id, uint32_t step_id, ...@@ -235,12 +233,17 @@ int job_step_signal(uint32_t job_id, uint32_t step_id,
return ESLURM_ALREADY_DONE; return ESLURM_ALREADY_DONE;
} }
_signal_step_tasks(step_ptr, signal); signal_step_tasks(step_ptr, signal);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
static void _signal_step_tasks(struct step_record *step_ptr, uint16_t signal) /*
* signal_step_tasks - send specific signal to specific job step
* IN step_ptr - step record pointer
* IN signal - signal to send
*/
void signal_step_tasks(struct step_record *step_ptr, uint16_t signal)
{ {
int i; int i;
kill_tasks_msg_t *kill_tasks_msg; kill_tasks_msg_t *kill_tasks_msg;
...@@ -329,7 +332,8 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, ...@@ -329,7 +332,8 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid,
return ESLURM_INVALID_JOB_ID; return ESLURM_INVALID_JOB_ID;
} }
if (job_ptr->kill_on_step_done) if ((job_ptr->kill_on_step_done) &&
(list_count(job_ptr->step_list) <= 1))
return job_complete(job_id, uid, requeue, job_return_code); return job_complete(job_id, uid, requeue, job_return_code);
if ((job_ptr->job_state == JOB_FAILED) || if ((job_ptr->job_state == JOB_FAILED) ||
...@@ -517,8 +521,12 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, ...@@ -517,8 +521,12 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record,
return ESLURM_BAD_DIST; return ESLURM_BAD_DIST;
#endif #endif
job_ptr->time_last_active = time(NULL); if (job_ptr->kill_on_step_done)
/* Don't start more steps, job already being cancelled */
return ESLURM_ALREADY_DONE;
job_ptr->kill_on_step_done = kill_job_when_step_done; job_ptr->kill_on_step_done = kill_job_when_step_done;
job_ptr->time_last_active = time(NULL);
nodeset = _pick_step_nodes (job_ptr, step_specs); nodeset = _pick_step_nodes (job_ptr, step_specs);
if (nodeset == NULL) if (nodeset == NULL)
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ; return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment