From 79528a6cf38c7dfb5dcafa1559acf02d226e1932 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 23 Nov 2004 02:14:34 +0000
Subject: [PATCH] Modify logic to exit gracefully (without mutex hang) when the
 plugin is   terminated.

---
 src/plugins/jobcomp/script/jobcomp_script.c | 39 +++++++++++++--------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/plugins/jobcomp/script/jobcomp_script.c b/src/plugins/jobcomp/script/jobcomp_script.c
index ba64d66ede6..ac1388e3eb4 100644
--- a/src/plugins/jobcomp/script/jobcomp_script.c
+++ b/src/plugins/jobcomp/script/jobcomp_script.c
@@ -97,7 +97,7 @@ static pthread_t script_thread = 0;
 static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t job_list_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t job_list_cond = PTHREAD_COND_INITIALIZER;
-
+static int agent_exit = 0;
 
 /*
  * Check if the script exists and if we can execute it.
@@ -264,19 +264,22 @@ void *script_agent (void *args) {
 	char user_id_str[32],job_id_str[32], nodes_cache[1];
 	char start_str[32], end_str[32], lim_str[32];
 	char submit_str[32], *batch_str;
-	char * argvp[] = {script,NULL};
+	char * argvp[] = {script, NULL};
 	int status;
 	char ** envp, * nodes;
 	job_record job;
 
 	while(1) {
 		pthread_mutex_lock(&job_list_mutex);
-		while(list_is_empty(job_list) != 0) {
-			pthread_cond_wait(&job_list_cond,&job_list_mutex);
+		while ((list_is_empty(job_list) != 0) && (agent_exit == 0)) {
+			pthread_cond_wait(&job_list_cond, &job_list_mutex);
+		}
+		if (agent_exit) {
+			pthread_mutex_unlock(&job_list_mutex);
+			return NULL;
 		}
 		job = (job_record)list_pop(job_list);
 		pthread_mutex_unlock(&job_list_mutex);
-		
 
 		snprintf(user_id_str,sizeof(user_id_str),"%u",job->user_id);
 		snprintf(job_id_str,sizeof(job_id_str),"%u",job->job_id);
@@ -443,34 +446,40 @@ char *slurm_jobcomp_strerror( int errnum )
 	return error_str;
 }
 
-static void _cancel_thread (pthread_t thread_id)
+static int _wait_for_thread (pthread_t thread_id)
 {
 	int i;
 
 	for (i=0; i<4; i++) {
-		if (pthread_cancel(thread_id))
-			return;
+		if (pthread_kill(thread_id, 0))
+			return SLURM_SUCCESS;
 		usleep(1000);
 	}
 	error("Could not kill jobcomp script pthread");
+	return SLURM_ERROR;
 }
 
 /* Called when script unloads */
 int fini ( void )
 {
+	int rc = SLURM_SUCCESS;
+
 	pthread_mutex_lock(&thread_flag_mutex);
-	if(script_thread) {
+	if (script_thread) {
 		verbose("Script Job Completion plugin shutting down");
-		_cancel_thread(script_thread);
+		agent_exit = 1;
+		pthread_cond_broadcast(&job_list_cond);
+		rc = _wait_for_thread(script_thread);
 		script_thread = 0;
 	}
 	pthread_mutex_unlock(&thread_flag_mutex);
 
 	xfree(script);
+	if (rc == SLURM_SUCCESS) {
+		pthread_mutex_lock(&job_list_mutex);
+		list_destroy(job_list);
+		pthread_mutex_unlock(&job_list_mutex);
+	}
 
-	pthread_mutex_lock(&job_list_mutex);
-	list_destroy(job_list);
-	pthread_mutex_unlock(&job_list_mutex);
-
-	return SLURM_SUCCESS;
+	return rc;
 }
-- 
GitLab