Skip to content
Snippets Groups Projects
Commit 79528a6c authored by Moe Jette's avatar Moe Jette
Browse files

Modify logic to exit gracefully (without mutex hang) when the plugin is terminated.
parent 97141d24
No related branches found
No related tags found
No related merge requests found
...@@ -97,7 +97,7 @@ static pthread_t script_thread = 0; ...@@ -97,7 +97,7 @@ static pthread_t script_thread = 0;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t job_list_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t job_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t job_list_cond = PTHREAD_COND_INITIALIZER; static pthread_cond_t job_list_cond = PTHREAD_COND_INITIALIZER;
static int agent_exit = 0;
/* /*
* Check if the script exists and if we can execute it. * Check if the script exists and if we can execute it.
...@@ -264,19 +264,22 @@ void *script_agent (void *args) { ...@@ -264,19 +264,22 @@ void *script_agent (void *args) {
char user_id_str[32],job_id_str[32], nodes_cache[1]; char user_id_str[32],job_id_str[32], nodes_cache[1];
char start_str[32], end_str[32], lim_str[32]; char start_str[32], end_str[32], lim_str[32];
char submit_str[32], *batch_str; char submit_str[32], *batch_str;
char * argvp[] = {script,NULL}; char * argvp[] = {script, NULL};
int status; int status;
char ** envp, * nodes; char ** envp, * nodes;
job_record job; job_record job;
while(1) { while(1) {
pthread_mutex_lock(&job_list_mutex); pthread_mutex_lock(&job_list_mutex);
while(list_is_empty(job_list) != 0) { while ((list_is_empty(job_list) != 0) && (agent_exit == 0)) {
pthread_cond_wait(&job_list_cond,&job_list_mutex); pthread_cond_wait(&job_list_cond, &job_list_mutex);
}
if (agent_exit) {
pthread_mutex_unlock(&job_list_mutex);
return NULL;
} }
job = (job_record)list_pop(job_list); job = (job_record)list_pop(job_list);
pthread_mutex_unlock(&job_list_mutex); pthread_mutex_unlock(&job_list_mutex);
snprintf(user_id_str,sizeof(user_id_str),"%u",job->user_id); snprintf(user_id_str,sizeof(user_id_str),"%u",job->user_id);
snprintf(job_id_str,sizeof(job_id_str),"%u",job->job_id); snprintf(job_id_str,sizeof(job_id_str),"%u",job->job_id);
...@@ -443,34 +446,40 @@ char *slurm_jobcomp_strerror( int errnum ) ...@@ -443,34 +446,40 @@ char *slurm_jobcomp_strerror( int errnum )
return error_str; return error_str;
} }
static void _cancel_thread (pthread_t thread_id) static int _wait_for_thread (pthread_t thread_id)
{ {
int i; int i;
for (i=0; i<4; i++) { for (i=0; i<4; i++) {
if (pthread_cancel(thread_id)) if (pthread_kill(thread_id, 0))
return; return SLURM_SUCCESS;
usleep(1000); usleep(1000);
} }
error("Could not kill jobcomp script pthread"); error("Could not kill jobcomp script pthread");
return SLURM_ERROR;
} }
/* Called when script unloads */ /* Called when script unloads */
int fini ( void ) int fini ( void )
{ {
int rc = SLURM_SUCCESS;
pthread_mutex_lock(&thread_flag_mutex); pthread_mutex_lock(&thread_flag_mutex);
if(script_thread) { if (script_thread) {
verbose("Script Job Completion plugin shutting down"); verbose("Script Job Completion plugin shutting down");
_cancel_thread(script_thread); agent_exit = 1;
pthread_cond_broadcast(&job_list_cond);
rc = _wait_for_thread(script_thread);
script_thread = 0; script_thread = 0;
} }
pthread_mutex_unlock(&thread_flag_mutex); pthread_mutex_unlock(&thread_flag_mutex);
xfree(script); xfree(script);
if (rc == SLURM_SUCCESS) {
pthread_mutex_lock(&job_list_mutex);
list_destroy(job_list);
pthread_mutex_unlock(&job_list_mutex);
}
pthread_mutex_lock(&job_list_mutex); return rc;
list_destroy(job_list);
pthread_mutex_unlock(&job_list_mutex);
return SLURM_SUCCESS;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment