From bead18c0cbc79cf0ba6fcb31d2c7149b91193b45 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 17 Jun 2008 19:54:53 +0000
Subject: [PATCH] Cancel pending, running and suspended jobs when their
 association is deleted.

---
 src/common/assoc_mgr.c     | 16 ++++++++++++++--
 src/common/assoc_mgr.h     |  9 +++++++--
 src/slurmctld/controller.c | 23 +++++++++++++++++++++--
 src/slurmctld/job_mgr.c    | 28 ++++++++++++++++++++++++++++
 src/slurmctld/slurmctld.h  |  8 ++++++++
 src/slurmdbd/slurmdbd.c    |  2 +-
 6 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c
index 6c9e4b92aa7..64efaf6aa33 100644
--- a/src/common/assoc_mgr.c
+++ b/src/common/assoc_mgr.c
@@ -47,6 +47,8 @@ static List local_association_list = NULL;
 static List local_user_list = NULL;
 static char *local_cluster_name = NULL;
 
+/* Callback run on an association just before it is deleted from the cache */
+static void (*remove_assoc_notify) (acct_association_rec_t *rec) = NULL;
 static pthread_mutex_t local_association_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t local_user_lock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -185,8 +187,16 @@ static int _get_local_user_list(void *db_conn, int enforce)
 	return SLURM_SUCCESS;
 }
 
-extern int assoc_mgr_init(void *db_conn, int enforce)
+extern int assoc_mgr_init(void *db_conn, assoc_init_args_t *args)
 {
+	int enforce = 0;
+
+	if(args) {
+		enforce = args->enforce;
+		if(args->remove_assoc_notify)
+			remove_assoc_notify = args->remove_assoc_notify;
+	}
+
 	if(!local_cluster_name && !slurmdbd_conf)
 		local_cluster_name = slurm_get_cluster_name();
 
@@ -201,7 +211,7 @@ extern int assoc_mgr_init(void *db_conn, int enforce)
 	return SLURM_SUCCESS;
 }
 
-extern int assoc_mgr_fini()
+extern int assoc_mgr_fini(void)
 {
 	if(local_association_list) 
 		list_destroy(local_association_list);
@@ -574,6 +584,8 @@ extern int assoc_mgr_update_local_assocs(acct_update_object_t *update)
 				//rc = SLURM_ERROR;
 				break;
 			}
+			if (remove_assoc_notify)
+				remove_assoc_notify(rec);
 			list_delete_item(itr);
 			break;
 		default:
diff --git a/src/common/assoc_mgr.h b/src/common/assoc_mgr.h
index baec27dc783..512b753d424 100644
--- a/src/common/assoc_mgr.h
+++ b/src/common/assoc_mgr.h
@@ -49,6 +49,11 @@
 #include <slurm/slurm.h>
 #include <slurm/slurm_errno.h>
 
+typedef struct {	/* assoc_mgr_init() arguments; a NULL pointer means none */
+	int enforce;	/* enforcement setting, read by assoc_mgr_init() */
+	void (*remove_assoc_notify) (acct_association_rec_t *rec);
+} assoc_init_args_t;	/* callback above is installed only if non-NULL */
+
 /* 
  * get info from the storage 
  * IN/OUT:  user - acct_user_rec_t with the name set of the user.
@@ -92,8 +97,8 @@ extern acct_admin_level_t assoc_mgr_get_admin_level(void *db_conn,
 extern int assoc_mgr_is_user_acct_coord(void *db_conn, uint32_t uid,
 					char *acct);
 
-extern int assoc_mgr_init(void *db_conn, int enforce);
-extern int assoc_mgr_fini();
+extern int assoc_mgr_init(void *db_conn, assoc_init_args_t *args);
+extern int assoc_mgr_fini(void);
 
 /* 
  * update associations in local cache 
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index b710214002f..cb3a28bb337 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -173,6 +173,7 @@ static void         _init_config(void);
 static void         _init_pidfile(void);
 static void         _kill_old_slurmctld(void);
 static void         _parse_commandline(int argc, char *argv[]);
+static void         _remove_assoc(acct_association_rec_t *rec);
 inline static int   _report_locks_set(void);
 static void *       _service_connection(void *arg);
 static int          _shutdown_backup_controller(int wait_time);
@@ -196,6 +197,7 @@ int main(int argc, char *argv[])
 	/* Locks: Write configuration, job, node, and partition */
 	slurmctld_lock_t config_write_lock = {
 		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
+	assoc_init_args_t assoc_init_arg;
 
 	/*
 	 * Establish initial configuration
@@ -297,7 +299,9 @@ int main(int argc, char *argv[])
 	slurmctld_cluster_name = xstrdup(slurmctld_conf.cluster_name);
 	accounting_enforce = slurmctld_conf.accounting_storage_enforce;
 	acct_db_conn = acct_storage_g_get_connection(true, false);
-	if (assoc_mgr_init(acct_db_conn, accounting_enforce) &&
+	assoc_init_arg.enforce = accounting_enforce;
+	assoc_init_arg.remove_assoc_notify = _remove_assoc;
+	if (assoc_mgr_init(acct_db_conn, &assoc_init_arg) &&
 	    accounting_enforce) {
 		error("assoc_mgr_init failure");
 		fatal("slurmdbd and/or database must be up at "
@@ -391,7 +395,7 @@ int main(int argc, char *argv[])
 		if(!acct_db_conn) {
 			acct_db_conn = 
 				acct_storage_g_get_connection(true, false);
-			if (assoc_mgr_init(acct_db_conn, accounting_enforce) &&
+			if (assoc_mgr_init(acct_db_conn, &assoc_init_arg) &&
 			    accounting_enforce) {
 				error("assoc_mgr_init failure");
 				fatal("slurmdbd and/or database must be up at "
@@ -948,6 +952,21 @@ static int _accounting_mark_all_nodes_down(char *reason)
 	}
 	return rc;
 }
+
+/* assoc_mgr removal callback: cancel the association's jobs and log it */
+static void _remove_assoc(acct_association_rec_t *rec)
+{
+	int cnt = 0;
+
+	if (accounting_enforce)	/* jobs are left alone when not enforcing */
+		cnt = job_cancel_by_assoc_id(rec->id);
+	if (cnt) {
+		info("Removed association id:%u user:%s, cancelled %d jobs",
+		     rec->id, rec->user, cnt);
+	} else
+		debug("Removed association id:%u user:%s", rec->id, rec->user);
+}
+
 /*
  * _slurmctld_background - process slurmctld background activities
  *	purge defunct job records, save state, schedule jobs, and 
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e8041885f34..39363695caa 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -5349,3 +5349,31 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 
 	return true;
 }
+
+/*
+ * job_cancel_by_assoc_id - Cancel all pending, running and suspended jobs with
+ *	a given association ID. This happens when an association is deleted
+ *	(e.g. when a user is removed from the association database).
+ * RET count of jobs signalled (job_signal's return value is not checked)
+ */
+extern int job_cancel_by_assoc_id(uint32_t assoc_id)
+{
+	int cnt = 0;
+	ListIterator job_iterator;
+	struct job_record *job_ptr;
+
+	if (!job_list)	/* no job list, nothing to cancel */
+		return cnt;
+
+	job_iterator = list_iterator_create(job_list);
+	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
+		if ((job_ptr->assoc_id != assoc_id) || 
+		    IS_JOB_FINISHED(job_ptr))
+			continue;	/* other association, or already finished */
+		info("Association deleted, cancelling job %u", job_ptr->job_id);
+		job_signal(job_ptr->job_id, SIGKILL, 0, 0);
+		cnt++;
+	}
+	list_iterator_destroy(job_iterator);
+	return cnt;
+}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 6ff3e1ee44d..cd16cf87723 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -794,6 +794,14 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 		int will_run, will_run_response_msg_t **resp, 
 		int allocate, uid_t submit_uid, struct job_record **job_pptr);
 
+/*
+ * job_cancel_by_assoc_id - Cancel all pending, running and suspended jobs with
+ *	a given association ID. This happens when an association is deleted
+ *	(e.g. when a user is removed from the association database).
+ * RET count of jobs signalled (job_signal's return value is not checked)
+ */
+extern int job_cancel_by_assoc_id(uint32_t assoc_id);
+
 /* log the completion of the specified job */
 extern void job_completion_logger(struct job_record  *job_ptr);
 
diff --git a/src/slurmdbd/slurmdbd.c b/src/slurmdbd/slurmdbd.c
index 6738308ef6a..ae2a361ca39 100644
--- a/src/slurmdbd/slurmdbd.c
+++ b/src/slurmdbd/slurmdbd.c
@@ -139,7 +139,7 @@ int main(int argc, char *argv[])
 
 	db_conn = acct_storage_g_get_connection(false, false);
 	
-	if(assoc_mgr_init(db_conn, 0) == SLURM_ERROR) {
+	if(assoc_mgr_init(db_conn, NULL) == SLURM_ERROR) {
 		error("Problem getting cache of data");
 		acct_storage_g_close_connection(&db_conn);
 		goto end_it;
-- 
GitLab