From 5d4db9cdcbbd7ff82158d8c57b2605b6c4ba0e71 Mon Sep 17 00:00:00 2001 From: Brian Christiansen <brian@schedmd.com> Date: Tue, 4 Apr 2017 23:27:39 -0600 Subject: [PATCH] Add scancel --sibling=<sib_name> to remove active sibling from federated job. --- doc/man/man1/scancel.1 | 4 ++++ src/scancel/opt.c | 16 +++++++++++---- src/scancel/scancel.c | 1 + src/scancel/scancel.h | 1 + src/slurmctld/fed_mgr.c | 42 ++++++++++++++++++++++++++++++++++++++++ src/slurmctld/fed_mgr.h | 1 + src/slurmctld/proc_req.c | 15 +++++++++----- 7 files changed, 71 insertions(+), 9 deletions(-) diff --git a/doc/man/man1/scancel.1 b/doc/man/man1/scancel.1 index d54d51f870f..6bb80179c43 100644 --- a/doc/man/man1/scancel.1 +++ b/doc/man/man1/scancel.1 @@ -90,6 +90,10 @@ This option is incompatible with the \fB\-\-verbose\fR option. \fB\-R\fR, \fB\-\-reservation\fR=\fIreservation_name\fR Restrict the scancel operation to jobs with this reservation name. +.TP +\fB\-\-sibling\fR=\fIcluster_name\fR +Remove an active sibling job from a federated job. + .TP \fB\-s\fR, \fB\-\-signal\fR=\fIsignal_name\fR The name or number of the signal to send. 
If this option is not used diff --git a/src/scancel/opt.c b/src/scancel/opt.c index 3de55ddc460..bece7b8b00b 100644 --- a/src/scancel/opt.c +++ b/src/scancel/opt.c @@ -61,10 +61,11 @@ #include "src/scancel/scancel.h" /* getopt_long options, integers but not characters */ -#define OPT_LONG_HELP 0x100 -#define OPT_LONG_USAGE 0x101 -#define OPT_LONG_CTLD 0x102 -#define OPT_LONG_WCKEY 0x103 +#define OPT_LONG_HELP 0x100 +#define OPT_LONG_USAGE 0x101 +#define OPT_LONG_CTLD 0x102 +#define OPT_LONG_WCKEY 0x103 +#define OPT_LONG_SIBLING 0x104 #define SIZE(a) (sizeof(a)/sizeof(a[0])) @@ -255,6 +256,7 @@ static void _opt_default(void) opt.partition = NULL; opt.qos = NULL; opt.reservation = NULL; + opt.sibling = NULL; opt.signal = (uint16_t) NO_VAL; opt.state = JOB_END; opt.user_id = 0; @@ -385,6 +387,7 @@ static void _opt_args(int argc, char **argv) {"qos", required_argument, 0, 'q'}, {"quiet", no_argument, 0, 'Q'}, {"reservation", required_argument, 0, 'R'}, + {"sibling", required_argument, 0, OPT_LONG_SIBLING}, {"signal", required_argument, 0, 's'}, {"state", required_argument, 0, 't'}, {"usage", no_argument, 0, OPT_LONG_USAGE}, @@ -465,6 +468,9 @@ static void _opt_args(int argc, char **argv) case (int)'w': opt.nodelist = xstrdup(optarg); break; + case OPT_LONG_SIBLING: + opt.sibling = xstrdup(optarg); + break; case OPT_LONG_WCKEY: opt.wckey = xstrdup(optarg); break; @@ -652,6 +658,7 @@ static void _opt_list(void) info("partition : %s", opt.partition); info("qos : %s", opt.qos); info("reservation : %s", opt.reservation); + info("sibling : %s", opt.sibling); if (opt.signal != (uint16_t) NO_VAL) info("signal : %u", opt.signal); info("state : %s", job_state_string(opt.state)); @@ -713,6 +720,7 @@ static void _help(void) printf(" -Q, --quiet disable warnings\n"); printf(" -q, --qos=qos act only on jobs with this quality of service\n"); printf(" -R, --reservation=reservation act only on jobs with this reservation\n"); + printf(" --sibling=cluster_name remove an active sibling 
job from a federated job\n"); printf(" -s, --signal=name | integer signal to send to job, default is SIGKILL\n"); printf(" -t, --state=states act only on jobs in this state. Valid job\n"); printf(" states are PENDING, RUNNING and SUSPENDED\n"); diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c index 5e0125378f0..7113849fcdd 100644 --- a/src/scancel/scancel.c +++ b/src/scancel/scancel.c @@ -755,6 +755,7 @@ _cancel_job_id (void *ci) kill_msg.flags = flags; kill_msg.job_id = NO_VAL; kill_msg.job_step_id = NO_VAL; + kill_msg.sibling = opt.sibling; kill_msg.signal = cancel_info->sig; kill_msg.sjob_id = cancel_info->job_id_str; diff --git a/src/scancel/scancel.h b/src/scancel/scancel.h index 2aa03ce79bd..7e4a8a3dd5b 100644 --- a/src/scancel/scancel.h +++ b/src/scancel/scancel.h @@ -46,6 +46,7 @@ typedef struct scancel_options { char *account; /* --account=n, -a */ bool batch; /* --batch, -b */ + char *sibling; /* --sibling=<sib_name> */ bool ctld; /* --ctld */ List clusters; /* --cluster=cluster_name -Mcluster-name */ bool full; /* --full, -f */ diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c index 3732085530f..5a02a624313 100644 --- a/src/slurmctld/fed_mgr.c +++ b/src/slurmctld/fed_mgr.c @@ -3828,3 +3828,45 @@ end_it: return SLURM_SUCCESS; } +/* + * Remove active sibling from the given job. + * + * IN job_id - job_id of job to remove active sibling from. + * IN sib_name - name of sibling job to remove from active siblings. + * RET SLURM_SUCCESS on success, error code on error. 
+ */ +extern int fed_mgr_remove_active_sibling(uint32_t job_id, char *sib_name) +{ + uint32_t origin_id; + struct job_record *job_ptr = NULL; + slurmdb_cluster_rec_t *sibling; + + if (!(job_ptr = find_job_record(job_id))) + return ESLURM_INVALID_JOB_ID; + + if (!_is_fed_job(job_ptr, &origin_id)) + return ESLURM_JOB_NOT_FEDERATED; + + if (job_ptr->fed_details->cluster_lock) + return ESLURM_JOB_NOT_PENDING; + + if (!(sibling = fed_mgr_get_cluster_by_name(sib_name))) + return ESLURM_INVALID_CLUSTER_NAME; + + if (job_ptr->fed_details->siblings_active & + FED_SIBLING_BIT(sibling->fed.id)) { + if (fed_mgr_cluster_rec == sibling) + fed_mgr_job_revoke(job_ptr, false, 0, 0); + else + _revoke_sibling_jobs(job_ptr->job_id, + fed_mgr_cluster_rec->fed.id, + FED_SIBLING_BIT(sibling->fed.id), + 0); + job_ptr->fed_details->siblings_active &= + ~(FED_SIBLING_BIT(sibling->fed.id)); + update_job_fed_details(job_ptr); + } + + return SLURM_SUCCESS; +} + diff --git a/src/slurmctld/fed_mgr.h b/src/slurmctld/fed_mgr.h index 9a5fa2b8da3..a9b22222ab3 100644 --- a/src/slurmctld/fed_mgr.h +++ b/src/slurmctld/fed_mgr.h @@ -78,6 +78,7 @@ extern int fed_mgr_job_revoke(struct job_record *job_ptr, extern int fed_mgr_job_revoke_sibs(struct job_record *job_ptr); extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id, time_t start_time); +extern int fed_mgr_remove_active_sibling(uint32_t job_id, char *sib_name); extern int fed_mgr_sib_will_run(slurm_msg_t *msg, job_desc_msg_t *job_desc, uid_t uid, will_run_response_msg_t **resp); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 771dc521434..7cad5d329f3 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -5582,11 +5582,16 @@ _slurm_rpc_kill_job2(slurm_msg_t *msg) _throttle_start(&active_rpc_cnt); lock_slurmctld(lock); - cc = job_str_signal(kill->sjob_id, - kill->signal, - kill->flags, - uid, - 0); + if (kill->sibling) { + uint32_t job_id = strtol(kill->sjob_id, NULL, 10); + cc = 
fed_mgr_remove_active_sibling(job_id, kill->sibling); + } else { + cc = job_str_signal(kill->sjob_id, + kill->signal, + kill->flags, + uid, + 0); + } unlock_slurmctld(lock); _throttle_fini(&active_rpc_cnt); -- GitLab