From bead18c0cbc79cf0ba6fcb31d2c7149b91193b45 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 17 Jun 2008 19:54:53 +0000 Subject: [PATCH] Cancel pending, running and suspended jobs when their association is deleted. --- src/common/assoc_mgr.c | 16 ++++++++++++++-- src/common/assoc_mgr.h | 9 +++++++-- src/slurmctld/controller.c | 23 +++++++++++++++++++++-- src/slurmctld/job_mgr.c | 28 ++++++++++++++++++++++++++++ src/slurmctld/slurmctld.h | 8 ++++++++ src/slurmdbd/slurmdbd.c | 2 +- 6 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 6c9e4b92aa7..64efaf6aa33 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -47,6 +47,8 @@ static List local_association_list = NULL; static List local_user_list = NULL; static char *local_cluster_name = NULL; +void (*remove_assoc_notify) (acct_association_rec_t *rec) = NULL; + static pthread_mutex_t local_association_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t local_user_lock = PTHREAD_MUTEX_INITIALIZER; @@ -185,8 +187,16 @@ static int _get_local_user_list(void *db_conn, int enforce) return SLURM_SUCCESS; } -extern int assoc_mgr_init(void *db_conn, int enforce) +extern int assoc_mgr_init(void *db_conn, assoc_init_args_t *args) { + int enforce = 0; + + if(args) { + enforce = args->enforce; + if(args->remove_assoc_notify) + remove_assoc_notify = args->remove_assoc_notify; + } + if(!local_cluster_name && !slurmdbd_conf) local_cluster_name = slurm_get_cluster_name(); @@ -201,7 +211,7 @@ extern int assoc_mgr_init(void *db_conn, int enforce) return SLURM_SUCCESS; } -extern int assoc_mgr_fini() +extern int assoc_mgr_fini(void) { if(local_association_list) list_destroy(local_association_list); @@ -574,6 +584,8 @@ extern int assoc_mgr_update_local_assocs(acct_update_object_t *update) //rc = SLURM_ERROR; break; } + if (remove_assoc_notify) + remove_assoc_notify(rec); list_delete_item(itr); break; default: diff --git 
a/src/common/assoc_mgr.h b/src/common/assoc_mgr.h index baec27dc783..512b753d424 100644 --- a/src/common/assoc_mgr.h +++ b/src/common/assoc_mgr.h @@ -49,6 +49,11 @@ #include <slurm/slurm.h> #include <slurm/slurm_errno.h> +typedef struct { + int enforce; + void (*remove_assoc_notify) (acct_association_rec_t *rec); +} assoc_init_args_t; + /* * get info from the storage * IN/OUT: user - acct_user_rec_t with the name set of the user. @@ -92,8 +97,8 @@ extern acct_admin_level_t assoc_mgr_get_admin_level(void *db_conn, extern int assoc_mgr_is_user_acct_coord(void *db_conn, uint32_t uid, char *acct); -extern int assoc_mgr_init(void *db_conn, int enforce); -extern int assoc_mgr_fini(); +extern int assoc_mgr_init(void *db_conn, assoc_init_args_t *args); +extern int assoc_mgr_fini(void); /* * update associations in local cache diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b710214002f..cb3a28bb337 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -173,6 +173,7 @@ static void _init_config(void); static void _init_pidfile(void); static void _kill_old_slurmctld(void); static void _parse_commandline(int argc, char *argv[]); +static void _remove_assoc(acct_association_rec_t *rec); inline static int _report_locks_set(void); static void * _service_connection(void *arg); static int _shutdown_backup_controller(int wait_time); @@ -196,6 +197,7 @@ int main(int argc, char *argv[]) /* Locks: Write configuration, job, node, and partition */ slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK }; + assoc_init_args_t assoc_init_arg; /* * Establish initial configuration @@ -297,7 +299,9 @@ int main(int argc, char *argv[]) slurmctld_cluster_name = xstrdup(slurmctld_conf.cluster_name); accounting_enforce = slurmctld_conf.accounting_storage_enforce; acct_db_conn = acct_storage_g_get_connection(true, false); - if (assoc_mgr_init(acct_db_conn, accounting_enforce) && + assoc_init_arg.enforce = 
accounting_enforce; + assoc_init_arg.remove_assoc_notify = _remove_assoc; + if (assoc_mgr_init(acct_db_conn, &assoc_init_arg) && accounting_enforce) { error("assoc_mgr_init failure"); fatal("slurmdbd and/or database must be up at " @@ -391,7 +395,7 @@ int main(int argc, char *argv[]) if(!acct_db_conn) { acct_db_conn = acct_storage_g_get_connection(true, false); - if (assoc_mgr_init(acct_db_conn, accounting_enforce) && + if (assoc_mgr_init(acct_db_conn, &assoc_init_arg) && accounting_enforce) { error("assoc_mgr_init failure"); fatal("slurmdbd and/or database must be up at " @@ -948,6 +952,21 @@ static int _accounting_mark_all_nodes_down(char *reason) } return rc; } + +static void _remove_assoc(acct_association_rec_t *rec) +{ + int cnt = 0; + + if (accounting_enforce) + cnt = job_cancel_by_assoc_id(rec->id); + + if (cnt) { + info("Removed association id:%u user:%s, cancelled %u jobs", + rec->id, rec->user, cnt); + } else + debug("Removed association id:%u user:%s", rec->id, rec->user); +} + /* * _slurmctld_background - process slurmctld background activities * purge defunct job records, save state, schedule jobs, and diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index e8041885f34..39363695caa 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -5349,3 +5349,31 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, return true; } + +/* + * job_cancel_by_assoc_id - Cancel all pending and running jobs with a given + * association ID. This happens when an association is deleted (e.g. when + * a user is removed from the association database). 
+ * RET count of cancelled jobs + */ +extern int job_cancel_by_assoc_id(uint32_t assoc_id) +{ + int cnt = 0; + ListIterator job_iterator; + struct job_record *job_ptr; + + if (!job_list) + return cnt; + + job_iterator = list_iterator_create(job_list); + while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + if ((job_ptr->assoc_id != assoc_id) || + IS_JOB_FINISHED(job_ptr)) + continue; + info("Association deleted, cancelling job %u", job_ptr->job_id); + job_signal(job_ptr->job_id, SIGKILL, 0, 0); + cnt++; + } + list_iterator_destroy(job_iterator); + return cnt; +} diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 6ff3e1ee44d..cd16cf87723 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -794,6 +794,14 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, will_run_response_msg_t **resp, int allocate, uid_t submit_uid, struct job_record **job_pptr); +/* + * job_cancel_by_assoc_id - Cancel all pending and running jobs with a given + * association ID. This happens when an association is deleted (e.g. when + * a user is removed from the association database). + * RET count of cancelled jobs + */ +extern int job_cancel_by_assoc_id(uint32_t assoc_id); + /* log the completion of the specified job */ extern void job_completion_logger(struct job_record *job_ptr); diff --git a/src/slurmdbd/slurmdbd.c b/src/slurmdbd/slurmdbd.c index 6738308ef6a..ae2a361ca39 100644 --- a/src/slurmdbd/slurmdbd.c +++ b/src/slurmdbd/slurmdbd.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]) db_conn = acct_storage_g_get_connection(false, false); - if(assoc_mgr_init(db_conn, 0) == SLURM_ERROR) { + if(assoc_mgr_init(db_conn, NULL) == SLURM_ERROR) { error("Problem getting cache of data"); acct_storage_g_close_connection(&db_conn); goto end_it; -- GitLab