From 9cd903cb394fece00e25ddeafec33150d9e04d76 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 30 Nov 2004 22:56:45 +0000
Subject: [PATCH] Define new function for plugin to terminate jobs on
 initiation failure.

---
 src/slurmctld/job_mgr.c   | 35 +++++++++++++++++++++++++++++++++++
 src/slurmctld/proc_req.c  | 27 +++++++++++++++++++++++++--
 src/slurmctld/proc_req.h  | 22 +++++++++++++++++++++-
 src/slurmctld/slurmctld.h | 17 ++++++++---------
 4 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 37758ea0541..b3a84914ad8 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1268,6 +1268,41 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run,
 	return SLURM_SUCCESS;
 }
 
+/*
+ * job_fail - terminate a job due to initiation failure
+ * IN job_id - id of the job to be killed
+ * RET 0 on success, otherwise ESLURM error code
+ */
+extern int job_fail(uint32_t job_id)
+{
+	struct job_record *job_ptr;
+	time_t now = time(NULL);
+
+	job_ptr = find_job_record(job_id);
+	if (job_ptr == NULL) {
+		error("job_fail: invalid job id %u", job_id);
+		return ESLURM_INVALID_JOB_ID;
+	}
+
+	if (IS_JOB_FINISHED(job_ptr))
+		return ESLURM_ALREADY_DONE;
+	if (job_ptr->job_state == JOB_RUNNING) {
+		/* No need to signal steps, deallocate kills them */
+		job_ptr->time_last_active       = now;
+		job_ptr->end_time               = now;
+		last_job_update                 = now;
+		job_ptr->job_state = JOB_FAILED | JOB_COMPLETING;
+		deallocate_nodes(job_ptr, false);
+		job_completion_logger(job_ptr);
+		return SLURM_SUCCESS;
+	}
+	/* All other states */
+	verbose("job_fail: job %u can't be killed from state=%s",
+		job_id, job_state_string(job_ptr->job_state));
+	return ESLURM_TRANSITION_STATE_NO_UPDATE;
+
+}
+
 /* 
  * job_signal - signal the specified job
  * IN job_id - id of the job to be signaled
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 63970823b5d..ee31935b74d 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1507,11 +1507,12 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg)
 
 /*
  * slurm_drain_nodes - process a request to drain a list of nodes,
- *  no-op for nodes already drained or draining
+ *	no-op for nodes already drained or draining
  * node_list IN - list of nodes to drain
  * reason IN - reason to drain the nodes
  * RET SLURM_SUCCESS or error code
- * NOTE: This is utilzed by plugins and not via RPC
+ * NOTE: This is utilized by plugins and not via RPC and it sets its
+ *	own locks.
  */
 extern int slurm_drain_nodes(char *node_list, char *reason)
 {
@@ -1527,6 +1528,28 @@ extern int slurm_drain_nodes(char *node_list, char *reason)
 	return error_code;
 }
 
+/*
+ * slurm_fail_job - terminate a job due to a launch failure
+ *      no-op for jobs already terminated
+ * job_id IN - slurm job id
+ * RET SLURM_SUCCESS or error code
+ * NOTE: This is utilized by plugins and not via RPC and it sets its
+ *      own locks.
+ */
+extern int slurm_fail_job(uint32_t job_id)
+{
+	int error_code;
+	/* Locks: Write job and node */
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
+
+	lock_slurmctld(job_write_lock);
+	error_code = job_fail(job_id);
+	unlock_slurmctld(job_write_lock);
+
+	return error_code;
+}
+
 /* _slurm_rpc_update_node - process RPC to update the configuration of a 
  *	node (e.g. UP/DOWN) */
 static void _slurm_rpc_update_node(slurm_msg_t * msg)
diff --git a/src/slurmctld/proc_req.h b/src/slurmctld/proc_req.h
index 324f7a5efaf..5be6ad8709f 100644
--- a/src/slurmctld/proc_req.h
+++ b/src/slurmctld/proc_req.h
@@ -3,7 +3,7 @@
  *****************************************************************************
  *  Copyright (C) 2002 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Moe Jette <jette@llnl.gov>, Kevin Tew <tew1@llnl.gov>, et. al.
+ *  Written by Morris Jette <jette1@llnl.gov> and Kevin Tew <tew1@llnl.gov> 
  *  UCRL-CODE-2002-040.
  *  
  *  This file is part of SLURM, a resource management program.
@@ -62,5 +62,25 @@ extern inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2,
  */
 void slurmctld_req (slurm_msg_t * msg);
 
+/*
+ * slurm_drain_nodes - process a request to drain a list of nodes,
+ *	no-op for nodes already drained or draining
+ * node_list IN - list of nodes to drain
+ * reason IN - reason to drain the nodes
+ * RET SLURM_SUCCESS or error code
+ * NOTE: This is utilized by plugins and not via RPC and it sets its
+ *	own locks.
+ */
+extern int slurm_drain_nodes(char *node_list, char *reason);
+
+/*
+ * slurm_fail_job - terminate a job due to a launch failure
+ *	no-op for jobs already terminated
+ * job_id IN - slurm job id
+ * RET SLURM_SUCCESS or error code
+ * NOTE: This is utilized by plugins and not via RPC and it sets its
+ *	own locks.
+ */
+extern int slurm_fail_job(uint32_t job_id);
 #endif /* !_HAVE_PROC_REQ_H */
 
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 8a84bdb0bbf..9a23b1082c8 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -3,7 +3,7 @@
  *****************************************************************************
  *  Copyright (C) 2002 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Morris Jette <jette@llnl.gov> et. al.
+ *  Written by Morris Jette <jette1@llnl.gov> et. al.
  *  UCRL-CODE-2002-040.
  *  
  *  This file is part of SLURM, a resource management program.
@@ -620,6 +620,13 @@ extern void job_fini (void);
  */
 extern bool job_is_completing(void);
 
+/*
+ * job_fail - terminate a job due to initiation failure
+ * IN job_id - id of the job to be killed
+ * RET 0 on success, otherwise ESLURM error code
+ */
+extern int job_fail(uint32_t job_id);
+
 /* 
  * job_signal - signal the specified job
  * IN job_id - id of the job to be signaled
@@ -1029,14 +1036,6 @@ extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
  */
 extern int slurmctld_shutdown(void);
 
-/*
- * slurm_drain_nodes - process a request to drain a list of nodes
- * node_list IN - list of nodes to drain
- * reason IN - reason to drain the nodes
- * RET SLURM_SUCCESS or error code
- */
-extern int slurm_drain_nodes(char *node_list, char *reason);
-
 /*
  * step_create - creates a step_record in step_specs->job_id, sets up the
  *	accoding to the step_specs.
-- 
GitLab