From 5d4db9cdcbbd7ff82158d8c57b2605b6c4ba0e71 Mon Sep 17 00:00:00 2001
From: Brian Christiansen <brian@schedmd.com>
Date: Tue, 4 Apr 2017 23:27:39 -0600
Subject: [PATCH] Add scancel --sibling=<sib_name>

to remove active sibling from federated job.
---
 doc/man/man1/scancel.1   |  4 ++++
 src/scancel/opt.c        | 16 +++++++++++----
 src/scancel/scancel.c    |  1 +
 src/scancel/scancel.h    |  1 +
 src/slurmctld/fed_mgr.c  | 42 ++++++++++++++++++++++++++++++++++++++++
 src/slurmctld/fed_mgr.h  |  1 +
 src/slurmctld/proc_req.c | 15 +++++++++-----
 7 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/doc/man/man1/scancel.1 b/doc/man/man1/scancel.1
index d54d51f870f..6bb80179c43 100644
--- a/doc/man/man1/scancel.1
+++ b/doc/man/man1/scancel.1
@@ -90,6 +90,10 @@ This option is incompatible with the \fB\-\-verbose\fR option.
 \fB\-R\fR, \fB\-\-reservation\fR=\fIreservation_name\fR
 Restrict the scancel operation to jobs with this reservation name.
 
+.TP
+\fB\-\-sibling\fR=\fIcluster_name\fR
+Remove an active sibling job from a federated job.
+
 .TP
 \fB\-s\fR, \fB\-\-signal\fR=\fIsignal_name\fR
 The name or number of the signal to send.  If this option is not used
diff --git a/src/scancel/opt.c b/src/scancel/opt.c
index 3de55ddc460..bece7b8b00b 100644
--- a/src/scancel/opt.c
+++ b/src/scancel/opt.c
@@ -61,10 +61,11 @@
 #include "src/scancel/scancel.h"
 
 /* getopt_long options, integers but not characters */
-#define OPT_LONG_HELP  0x100
-#define OPT_LONG_USAGE 0x101
-#define OPT_LONG_CTLD  0x102
-#define OPT_LONG_WCKEY 0x103
+#define OPT_LONG_HELP    0x100
+#define OPT_LONG_USAGE   0x101
+#define OPT_LONG_CTLD    0x102
+#define OPT_LONG_WCKEY   0x103
+#define OPT_LONG_SIBLING 0x104
 
 #define SIZE(a) (sizeof(a)/sizeof(a[0]))
 
@@ -255,6 +256,7 @@ static void _opt_default(void)
 	opt.partition	= NULL;
 	opt.qos		= NULL;
 	opt.reservation	= NULL;
+	opt.sibling     = NULL;
 	opt.signal	= (uint16_t) NO_VAL;
 	opt.state	= JOB_END;
 	opt.user_id	= 0;
@@ -385,6 +387,7 @@ static void _opt_args(int argc, char **argv)
 		{"qos",         required_argument, 0, 'q'},
 		{"quiet",       no_argument,       0, 'Q'},
 		{"reservation", required_argument, 0, 'R'},
+		{"sibling",     required_argument, 0, OPT_LONG_SIBLING},
 		{"signal",      required_argument, 0, 's'},
 		{"state",       required_argument, 0, 't'},
 		{"usage",       no_argument,       0, OPT_LONG_USAGE},
@@ -465,6 +468,9 @@ static void _opt_args(int argc, char **argv)
 		case (int)'w':
 			opt.nodelist = xstrdup(optarg);
 			break;
+		case OPT_LONG_SIBLING:
+			opt.sibling = xstrdup(optarg);
+			break;
 		case OPT_LONG_WCKEY:
 			opt.wckey = xstrdup(optarg);
 			break;
@@ -652,6 +658,7 @@ static void _opt_list(void)
 	info("partition      : %s", opt.partition);
 	info("qos            : %s", opt.qos);
 	info("reservation    : %s", opt.reservation);
+	info("sibling        : %s", opt.sibling);
 	if (opt.signal != (uint16_t) NO_VAL)
 		info("signal         : %u", opt.signal);
 	info("state          : %s", job_state_string(opt.state));
@@ -713,6 +720,7 @@ static void _help(void)
 	printf("  -Q, --quiet                     disable warnings\n");
 	printf("  -q, --qos=qos                   act only on jobs with this quality of service\n");
 	printf("  -R, --reservation=reservation   act only on jobs with this reservation\n");
+	printf("      --sibling=cluster_name      remove an active sibling job from a federated job\n");
 	printf("  -s, --signal=name | integer     signal to send to job, default is SIGKILL\n");
 	printf("  -t, --state=states              act only on jobs in this state.  Valid job\n");
 	printf("                                  states are PENDING, RUNNING and SUSPENDED\n");
diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c
index 5e0125378f0..7113849fcdd 100644
--- a/src/scancel/scancel.c
+++ b/src/scancel/scancel.c
@@ -755,6 +755,7 @@ _cancel_job_id (void *ci)
 		kill_msg.flags	= flags;
 		kill_msg.job_id      = NO_VAL;
 		kill_msg.job_step_id = NO_VAL;
+		kill_msg.sibling     = opt.sibling;
 		kill_msg.signal      = cancel_info->sig;
 		kill_msg.sjob_id     = cancel_info->job_id_str;
 
diff --git a/src/scancel/scancel.h b/src/scancel/scancel.h
index 2aa03ce79bd..7e4a8a3dd5b 100644
--- a/src/scancel/scancel.h
+++ b/src/scancel/scancel.h
@@ -46,6 +46,7 @@
 typedef struct scancel_options {
 	char *account;		/* --account=n, -a		*/
 	bool batch;		/* --batch, -b			*/
+	char *sibling;		/* --sibling=<sib_name>		*/
 	bool ctld;		/* --ctld			*/
 	List clusters;          /* --cluster=cluster_name -Mcluster-name */
 	bool full;		/* --full, -f			*/
diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c
index 3732085530f..5a02a624313 100644
--- a/src/slurmctld/fed_mgr.c
+++ b/src/slurmctld/fed_mgr.c
@@ -3828,3 +3828,45 @@ end_it:
 	return SLURM_SUCCESS;
 }
 
+/*
+ * Remove active sibling from the given job.
+ *
+ * IN job_id   - job_id of job to remove active sibling from.
+ * IN sib_name - name of sibling cluster to remove from active siblings.
+ * RET SLURM_SUCCESS on success, error code on error.
+ */
+extern int fed_mgr_remove_active_sibling(uint32_t job_id, char *sib_name)
+{
+	uint32_t origin_id;
+	struct job_record *job_ptr = NULL;
+	slurmdb_cluster_rec_t *sibling;
+
+	if (!(job_ptr = find_job_record(job_id)))
+		return ESLURM_INVALID_JOB_ID;
+
+	if (!_is_fed_job(job_ptr, &origin_id))
+		return ESLURM_JOB_NOT_FEDERATED;
+
+	if (job_ptr->fed_details->cluster_lock)
+		return ESLURM_JOB_NOT_PENDING;
+
+	if (!(sibling = fed_mgr_get_cluster_by_name(sib_name)))
+		return ESLURM_INVALID_CLUSTER_NAME;
+
+	if (job_ptr->fed_details->siblings_active &
+	    FED_SIBLING_BIT(sibling->fed.id)) {
+		if (fed_mgr_cluster_rec == sibling)
+			fed_mgr_job_revoke(job_ptr, false, 0, 0);
+		else
+			_revoke_sibling_jobs(job_ptr->job_id,
+					     fed_mgr_cluster_rec->fed.id,
+					     FED_SIBLING_BIT(sibling->fed.id),
+					     0);
+		job_ptr->fed_details->siblings_active &=
+			~(FED_SIBLING_BIT(sibling->fed.id));
+		update_job_fed_details(job_ptr);
+	}
+
+	return SLURM_SUCCESS;
+}
+
diff --git a/src/slurmctld/fed_mgr.h b/src/slurmctld/fed_mgr.h
index 9a5fa2b8da3..a9b22222ab3 100644
--- a/src/slurmctld/fed_mgr.h
+++ b/src/slurmctld/fed_mgr.h
@@ -78,6 +78,7 @@ extern int       fed_mgr_job_revoke(struct job_record *job_ptr,
 extern int       fed_mgr_job_revoke_sibs(struct job_record *job_ptr);
 extern int       fed_mgr_job_start(struct job_record *job_ptr,
 				   uint32_t cluster_id, time_t start_time);
+extern int       fed_mgr_remove_active_sibling(uint32_t job_id, char *sib_name);
 extern int       fed_mgr_sib_will_run(slurm_msg_t *msg,
 				      job_desc_msg_t *job_desc, uid_t uid,
 				      will_run_response_msg_t **resp);
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 771dc521434..7cad5d329f3 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -5582,11 +5582,16 @@ _slurm_rpc_kill_job2(slurm_msg_t *msg)
 
 	_throttle_start(&active_rpc_cnt);
 	lock_slurmctld(lock);
-	cc = job_str_signal(kill->sjob_id,
-			    kill->signal,
-			    kill->flags,
-			    uid,
-			    0);
+	if (kill->sibling) {
+		uint32_t job_id = strtol(kill->sjob_id, NULL, 10);
+		cc = fed_mgr_remove_active_sibling(job_id, kill->sibling);
+	} else {
+		cc = job_str_signal(kill->sjob_id,
+				    kill->signal,
+				    kill->flags,
+				    uid,
+				    0);
+	}
 	unlock_slurmctld(lock);
 	_throttle_fini(&active_rpc_cnt);
 
-- 
GitLab