From 9cd903cb394fece00e25ddeafec33150d9e04d76 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 30 Nov 2004 22:56:45 +0000 Subject: [PATCH] Define new function for plugin to terminate jobs on initiation failure. --- src/slurmctld/job_mgr.c | 35 +++++++++++++++++++++++++++++++++++ src/slurmctld/proc_req.c | 27 +++++++++++++++++++++++++-- src/slurmctld/proc_req.h | 22 +++++++++++++++++++++- src/slurmctld/slurmctld.h | 17 ++++++++--------- 4 files changed, 89 insertions(+), 12 deletions(-) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 37758ea0541..b3a84914ad8 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1268,6 +1268,41 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, return SLURM_SUCCESS; } +/* + * job_fail - terminate a job due to initiation failure + * IN job_id - id of the job to be killed + * RET 0 on success, otherwise ESLURM error code + */ +extern int job_fail(uint32_t job_id) +{ + struct job_record *job_ptr; + time_t now = time(NULL); + + job_ptr = find_job_record(job_id); + if (job_ptr == NULL) { + error("job_fail: invalid job id %u", job_id); + return ESLURM_INVALID_JOB_ID; + } + + if (IS_JOB_FINISHED(job_ptr)) + return ESLURM_ALREADY_DONE; + if (job_ptr->job_state == JOB_RUNNING) { + /* No need to signal steps, deallocate kills them */ + job_ptr->time_last_active = now; + job_ptr->end_time = now; + last_job_update = now; + job_ptr->job_state = JOB_FAILED | JOB_COMPLETING; + deallocate_nodes(job_ptr, false); + job_completion_logger(job_ptr); + return SLURM_SUCCESS; + } + /* All other states */ + verbose("job_fail: job %u can't be killed from state=%s", + job_id, job_state_string(job_ptr->job_state)); + return ESLURM_TRANSITION_STATE_NO_UPDATE; + +} + /* * job_signal - signal the specified job * IN job_id - id of the job to be signaled diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 63970823b5d..ee31935b74d 100644 --- 
a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1507,11 +1507,12 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg) /* * slurm_drain_nodes - process a request to drain a list of nodes, - * no-op for nodes already drained or draining + * no-op for nodes already drained or draining * node_list IN - list of nodes to drain * reason IN - reason to drain the nodes * RET SLURM_SUCCESS or error code - * NOTE: This is utilzed by plugins and not via RPC + * NOTE: This is utilized by plugins and not via RPC and it sets its + * own locks. */ extern int slurm_drain_nodes(char *node_list, char *reason) { @@ -1527,6 +1528,28 @@ extern int slurm_drain_nodes(char *node_list, char *reason) return error_code; } +/* + * slurm_fail_job - terminate a job due to a launch failure + * no-op for jobs already terminated + * job_id IN - slurm job id + * RET SLURM_SUCCESS or error code + * NOTE: This is utilized by plugins and not via RPC and it sets its + * own locks. + */ +extern int slurm_fail_job(uint32_t job_id) +{ + int error_code; + /* Locks: Write job and node */ + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + + lock_slurmctld(job_write_lock); + error_code = job_fail(job_id); + unlock_slurmctld(job_write_lock); + + return error_code; +} + /* _slurm_rpc_update_node - process RPC to update the configuration of a * node (e.g. UP/DOWN) */ static void _slurm_rpc_update_node(slurm_msg_t * msg) diff --git a/src/slurmctld/proc_req.h b/src/slurmctld/proc_req.h index 324f7a5efaf..5be6ad8709f 100644 --- a/src/slurmctld/proc_req.h +++ b/src/slurmctld/proc_req.h @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Moe Jette <jette@llnl.gov>, Kevin Tew <tew1@llnl.gov>, et. al. 
+ * Written by Morris Jette <jette1@llnl.gov> and Kevin Tew <tew1@llnl.gov> * UCRL-CODE-2002-040. * * This file is part of SLURM, a resource management program. @@ -62,5 +62,25 @@ extern inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2, */ void slurmctld_req (slurm_msg_t * msg); +/* + * slurm_drain_nodes - process a request to drain a list of nodes, + * no-op for nodes already drained or draining + * node_list IN - list of nodes to drain + * reason IN - reason to drain the nodes + * RET SLURM_SUCCESS or error code + * NOTE: This is utilized by plugins and not via RPC and it sets its + * own locks. + */ +extern int slurm_drain_nodes(char *node_list, char *reason); + +/* + * slurm_fail_job - terminate a job due to a launch failure + * no-op for jobs already terminated + * job_id IN - slurm job id + * RET SLURM_SUCCESS or error code + * NOTE: This is utilized by plugins and not via RPC and it sets its + * own locks. + */ +extern int slurm_fail_job(uint32_t job_id); #endif /* !_HAVE_PROC_REQ_H */ diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 8a84bdb0bbf..9a23b1082c8 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Morris Jette <jette@llnl.gov> et. al. + * Written by Morris Jette <jette1@llnl.gov> et. al. * UCRL-CODE-2002-040. * * This file is part of SLURM, a resource management program. 
@@ -620,6 +620,13 @@ extern void job_fini (void); */ extern bool job_is_completing(void); +/* + * job_fail - terminate a job due to initiation failure + * IN job_id - id of the job to be killed + * RET 0 on success, otherwise ESLURM error code + */ +extern int job_fail(uint32_t job_id); + /* * job_signal - signal the specified job * IN job_id - id of the job to be signaled @@ -1029,14 +1036,6 @@ extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal); */ extern int slurmctld_shutdown(void); -/* - * slurm_drain_nodes - process a request to drain a list of nodes - * node_list IN - list of nodes to drain - * reason IN - reason to drain the nodes - * RET SLURM_SUCCESS or error code - */ -extern int slurm_drain_nodes(char *node_list, char *reason); - /* * step_create - creates a step_record in step_specs->job_id, sets up the * accoding to the step_specs. -- GitLab