From 250a02121d2080af92f508226be3169ffd3bd742 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 5 Dec 2002 19:34:10 +0000 Subject: [PATCH] Add flag to job when initiated with allocated_resources_and_run_job_step(). If set, the job step termination is treated like job termination (i.e. the nodes are released and the job state set to complete). --- src/slurmctld/controller.c | 8 +++++--- src/slurmctld/slurmctld.h | 11 +++++++++-- src/slurmctld/step_mgr.c | 13 +++++++++++-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 6526f471d4d..8b78a11487f 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1026,6 +1026,7 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) debug("Processing RPC: REQUEST_COMPLETE_JOB_STEP"); uid = slurm_auth_uid(msg->cred); lock_slurmctld(job_write_lock); + /* do RPC call */ /* First set node down as needed on fatal error */ if (complete_job_step_msg->slurm_rc != SLURM_SUCCESS) { @@ -1077,7 +1078,8 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) error_code = job_step_complete(complete_job_step_msg->job_id, complete_job_step_msg->job_step_id, - uid); + uid, job_requeue, + complete_job_step_msg->job_rc); unlock_slurmctld(job_write_lock); /* return result */ @@ -1502,7 +1504,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) req_step_msg.node_count = INFINITE; req_step_msg.cpu_count = job_desc_msg->num_procs; req_step_msg.num_tasks = job_desc_msg->num_tasks; - error_code = step_create(&req_step_msg, &step_rec); + error_code = step_create(&req_step_msg, &step_rec, true); /* note: no need to free step_rec, pointer to global job step record */ if (error_code) { unlock_slurmctld(job_write_lock); @@ -1841,7 +1843,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) if (error_code == 0) { /* issue the RPC */ lock_slurmctld(job_write_lock); - error_code = step_create(req_step_msg, &step_rec); + error_code = step_create(req_step_msg, &step_rec, false); } /* return result */ diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index fcaf15b2bfc..92be49c958e 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -224,6 +224,8 @@ struct job_record { enum job_states job_state; /* state of the job */ uint16_t kill_on_node_fail; /* 1 if job should be killed on node failure */ + uint16_t kill_on_step_done; /* 1 if job should be killed when + the job step completes */ char *nodes; /* list of nodes allocated to job */ bitstr_t *node_bitmap; /* bitmap of nodes allocated to job */ uint32_t time_limit; /* time_limit minutes or INFINITE */ @@ -597,12 +599,14 @@ extern int job_complete (uint32_t job_id, uid_t uid, bool requeue, * IN job_id - id of the job to be completed * IN step_id - id of the job step to be completed * IN uid - user id of user issuing the RPC + * IN requeue - job should be run again if possible + * IN job_return_code - job's return code, if set then set state to JOB_FAILED * RET 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, - uid_t uid); + uid_t uid, bool requeue, uint32_t job_return_code); /* * job_time_limit - terminate jobs which have exceeded their time limit @@ -853,11 +857,14 @@ extern void set_slurmd_addr (void); * accoding to the step_specs. * IN step_specs - job step specifications * OUT new_step_record - pointer to the new step_record (NULL on error) + * IN kill_job_when_step_done - if set kill the job on step completion * RET - 0 or error code * NOTE: don't free the returned step_record because that is managed through * the job. */ -extern int step_create ( step_specs *step_specs, struct step_record** ); +extern int step_create ( step_specs *step_specs, + struct step_record** new_step_record, + bool kill_job_when_step_done ); /* * update_job - update a job's parameters per the supplied specifications diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 09d4f35f9c4..7460c4dd93a 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -249,11 +249,14 @@ int job_step_cancel(uint32_t job_id, uint32_t step_id, uid_t uid) * IN job_id - id of the job to be completed * IN step_id - id of the job step to be completed * IN uid - user id of user issuing the RPC + * IN requeue - job should be run again if possible + * IN job_return_code - job's return code, if set then set state to JOB_FAILED * RET 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ -int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid) +int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, + bool requeue, uint32_t job_return_code) { struct job_record *job_ptr; int error_code; @@ -264,6 +267,9 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid) return ESLURM_INVALID_JOB_ID; } + if (job_ptr->kill_on_step_done) + return job_complete(job_id, uid, requeue, job_return_code); + if ((job_ptr->job_state == JOB_FAILED) || (job_ptr->job_state == JOB_COMPLETE) || (job_ptr->job_state == JOB_TIMEOUT)) @@ -413,12 +419,14 @@ cleanup: * according to the step_specs. * IN step_specs - job step specifications * OUT new_step_record - pointer to the new step_record (NULL on error) + * IN kill_job_when_step_done - if set kill the job on step completion * RET - 0 or error code * NOTE: don't free the returned step_record because that is managed through * the job. */ int -step_create ( step_specs *step_specs, struct step_record** new_step_record ) +step_create ( step_specs *step_specs, struct step_record** new_step_record, + bool kill_job_when_step_done ) { struct step_record *step_ptr; struct job_record *job_ptr; @@ -450,6 +458,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record ) #endif job_ptr->time_last_active = time(NULL); + job_ptr->kill_on_step_done = kill_job_when_step_done; nodeset = _pick_step_nodes (job_ptr, step_specs); if (nodeset == NULL) return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ; -- GitLab