From 40cd97d6e8d9e184742a596aa3f3c7baf31a1f9d Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 25 Apr 2007 22:12:16 +0000
Subject: [PATCH] Major re-write of jobcomp/script plugin: fix memory leak and 
    general code clean-up - Mark Grondona.

---
 NEWS                                        |   2 +
 src/plugins/jobcomp/script/Makefile.am      |   2 +-
 src/plugins/jobcomp/script/Makefile.in      |   5 +-
 src/plugins/jobcomp/script/job_record.c     | 102 ----
 src/plugins/jobcomp/script/job_record.h     |  70 ---
 src/plugins/jobcomp/script/jobcomp_script.c | 526 ++++++++++++--------
 6 files changed, 323 insertions(+), 384 deletions(-)
 delete mode 100644 src/plugins/jobcomp/script/job_record.c
 delete mode 100644 src/plugins/jobcomp/script/job_record.h

diff --git a/NEWS b/NEWS
index a9f4cb2645b..3546b3bffcc 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins.
  -- BLUEGENE - code to make it so you can make a 36x36x36 system.  
     The wiring isn't correct on the X axis for anything over 8 though.
     But that is only in emulation mode.
+ -- Major re-write of jobcomp/script plugin: fix memory leak and 
+    general code clean-up.
 
 * Changes in SLURM 1.2.6
 ========================
diff --git a/src/plugins/jobcomp/script/Makefile.am b/src/plugins/jobcomp/script/Makefile.am
index 9ad7aa90470..eae162b656f 100644
--- a/src/plugins/jobcomp/script/Makefile.am
+++ b/src/plugins/jobcomp/script/Makefile.am
@@ -9,5 +9,5 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
 pkglib_LTLIBRARIES = jobcomp_script.la
 
 # Text file job completion logging plugin.
-jobcomp_script_la_SOURCES = job_record.c job_record.h jobcomp_script.c
+jobcomp_script_la_SOURCES = jobcomp_script.c
 jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
diff --git a/src/plugins/jobcomp/script/Makefile.in b/src/plugins/jobcomp/script/Makefile.in
index 871dba84763..0a9a4cfbab6 100644
--- a/src/plugins/jobcomp/script/Makefile.in
+++ b/src/plugins/jobcomp/script/Makefile.in
@@ -78,7 +78,7 @@ am__installdirs = "$(DESTDIR)$(pkglibdir)"
 pkglibLTLIBRARIES_INSTALL = $(INSTALL)
 LTLIBRARIES = $(pkglib_LTLIBRARIES)
 jobcomp_script_la_LIBADD =
-am_jobcomp_script_la_OBJECTS = job_record.lo jobcomp_script.lo
+am_jobcomp_script_la_OBJECTS = jobcomp_script.lo
 jobcomp_script_la_OBJECTS = $(am_jobcomp_script_la_OBJECTS)
 DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) -I$(top_builddir)/slurm
 depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp
@@ -285,7 +285,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
 pkglib_LTLIBRARIES = jobcomp_script.la
 
 # Text file job completion logging plugin.
-jobcomp_script_la_SOURCES = job_record.c job_record.h jobcomp_script.c
+jobcomp_script_la_SOURCES = jobcomp_script.c
 jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
 all: all-am
 
@@ -356,7 +356,6 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_record.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jobcomp_script.Plo@am__quote@
 
 .c.o:
diff --git a/src/plugins/jobcomp/script/job_record.c b/src/plugins/jobcomp/script/job_record.c
deleted file mode 100644
index 32d987c8f2f..00000000000
--- a/src/plugins/jobcomp/script/job_record.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/******************************************************************************
- *  job_record.c - Functions to alter script plugin job record structure.
- ******************************************************************************
- *  Produced at Center for High Performance Computing, North Dakota State
- *  University
- *  Written by Nathan Huff <nhuff@acm.org>
- *  UCRL-CODE-226842.
- *
- *  This file is part of SLURM, a resource management program.
- *  For details, see <http://www.llnl.gov/linux/slurm/>.
- *
- *  SLURM is free software; you can redistribute it and/or modify it under
- *  the terms of the GNU General Public License as published by the Free
- *  Software Foundation; either version 2 of the License, or (at your option)
- *  any later version.
- *
- *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
- *  certain conditions as described in each individual source file, and 
- *  distribute linked combinations including the two. You must obey the GNU 
- *  General Public License in all respects for all of the code used other than 
- *  OpenSSL. If you modify file(s) with this exception, you may extend this 
- *  exception to your version of the file(s), but you are not obligated to do 
- *  so. If you do not wish to do so, delete this exception statement from your
- *  version.  If you delete this exception statement from all source files in 
- *  the program, then also delete it here.
- *  
- *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
- *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- *  details.
- *  
- *  You should have received a copy of the GNU General Public License along
- *  with SLURM; if not, write to the Free Software Foundation, Inc.,
- *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
-\*****************************************************************************/
-
-#ifdef HAVE_CONFIG_H
-#	include "config.h"
-#endif
-
-#if HAVE_STDINT_H
-#	include <stdint.h>
-#endif
-#if HAVE_INTTYPES_H
-#	include <inttypes.h>
-#endif
-
-
-#include <unistd.h>
-#include <sys/types.h>
-#include <slurm/slurm.h>
-#include "job_record.h"
-#include "src/common/xmalloc.h"
-#include "src/common/xstring.h"
-
-/* 
- * Create a new job_record containing the job completion information
- */
-job_record job_record_create(uint32_t job_id, uint32_t user_id, char *job_name,
-	char *job_state, char *partition, 
-	uint32_t limit, time_t start, time_t end, time_t submit, 
-	uint16_t batch_flag, char *node_list, uint32_t num_procs, 
-	char *account)
-{
-	job_record ret;
-
-	ret = xmalloc(sizeof(struct job_record_));
-	ret->job_id = job_id;
-	ret->user_id = user_id;
-	ret->job_name = xstrdup(job_name);
-	ret->job_state = xstrdup(job_state);
-	ret->partition = xstrdup(partition);
-	ret->limit = limit;
-	ret->start = start;
-	ret->submit = submit;
-	ret->batch_flag = batch_flag;
-	ret->end = end;
-	ret->node_list = xstrdup(node_list);
-	ret->num_procs = num_procs;
-	if (account)
-		ret->account = xstrdup(account);
-
-	return ret;
-}
-
-/*
- * Free the memory from a job_record structure
- */
-void job_record_destroy(void *job) {
-	job_record j = job;
-
-	if (j == NULL)
-		return;
-
-	xfree(j->job_name);
-	xfree(j->partition);
-	xfree(j->node_list);
-	xfree(j->job_state);
-	xfree(j->account);
-	xfree(j);
-}
diff --git a/src/plugins/jobcomp/script/job_record.h b/src/plugins/jobcomp/script/job_record.h
deleted file mode 100644
index cfd762739e5..00000000000
--- a/src/plugins/jobcomp/script/job_record.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*****************************************************************************\
- *  job_record.h - Header for job_record structure.
- *****************************************************************************
- *  Produced at Center for High Performance Computing, North Dakota State 
- *  University
- *  Written by Nathan Huff <nhuff@geekshanty.com>
- *  
- *  This file is part of SLURM, a resource management program.
- *  For details, see <http://www.llnl.gov/linux/slurm/>.
- *  
- *  SLURM is free software; you can redistribute it and/or modify it under
- *  the terms of the GNU General Public License as published by the Free
- *  Software Foundation; either version 2 of the License, or (at your option)
- *  any later version.
- *
- *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
- *  certain conditions as described in each individual source file, and 
- *  distribute linked combinations including the two. You must obey the GNU 
- *  General Public License in all respects for all of the code used other than 
- *  OpenSSL. If you modify file(s) with this exception, you may extend this 
- *  exception to your version of the file(s), but you are not obligated to do 
- *  so. If you do not wish to do so, delete this exception statement from your
- *  version.  If you delete this exception statement from all source files in 
- *  the program, then also delete it here.
- *  
- *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
- *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- *  details.
- *  
- *  You should have received a copy of the GNU General Public License along
- *  with SLURM; if not, write to the Free Software Foundation, Inc.,
- *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
-\*****************************************************************************/
-
-
-#ifndef _JOB_RECORD_H
-#define _JOB_RECORD_H
-
-/* Structure to contain job completion information */
-struct job_record_ {
-	uint32_t job_id;
-	uint32_t user_id;
-	char *job_name;
-	char *job_state;
-	char *partition;
-	uint32_t limit;
-	uint16_t batch_flag;
-	uint32_t num_procs;
-	time_t submit;
-	time_t start;
-	time_t end;
-	char *node_list;
-	char *account;
-};
-
-typedef struct job_record_ * job_record;
-
-/* Create a job record */
-job_record job_record_create(uint32_t job_id, uint32_t user_id, char *job_name,
-	char *job_state, char *partition, 
-	uint32_t limit, time_t start, time_t end, time_t submit, 
-	uint16_t batch_flag, char *node_list, uint32_t num_procs, 
-	char *account);
-
-/* Destroy a job record */
-void job_record_destroy(void *j);
-
-#endif /* _JOB_RECORD_H */ 
diff --git a/src/plugins/jobcomp/script/jobcomp_script.c b/src/plugins/jobcomp/script/jobcomp_script.c
index 1afb8215da9..d2ec5363527 100644
--- a/src/plugins/jobcomp/script/jobcomp_script.c
+++ b/src/plugins/jobcomp/script/jobcomp_script.c
@@ -47,12 +47,13 @@
 #  include <inttypes.h>
 #endif
 
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdarg.h>
 #include <paths.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <fcntl.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <pthread.h>
@@ -65,7 +66,6 @@
 #include "src/common/xstring.h"
 #include "src/common/list.h"
 #include "src/slurmctld/slurmctld.h"
-#include "job_record.h"
 
 /*
  * These variables are required by the generic plugin interface.  If they
@@ -100,9 +100,7 @@ const char plugin_name[]       	= "Job completion logging script plugin";
 const char plugin_type[]       	= "jobcomp/script";
 const uint32_t plugin_version	= 90;
 
-static int plugin_errno = SLURM_SUCCESS;
 static char * script = NULL;
-static char error_str[256];
 static List comp_list = NULL;
 
 static pthread_t script_thread = 0;
@@ -111,222 +109,343 @@ static pthread_mutex_t comp_list_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t comp_list_cond = PTHREAD_COND_INITIALIZER;
 static int agent_exit = 0;
 
+/*
+ *  Local plugin errno
+ */
+static int plugin_errno = SLURM_SUCCESS;
+
+static struct jobcomp_errno {
+	int n;
+	const char *descr;
+} errno_table [] = {
+	{ 0,      "No Error"              },
+	{ EACCES, "Script access denied"  },
+	{ EEXIST, "Script does not exist" },
+	{ EINVAL, "JocCompLoc invalid"    },
+	{ -1,     "Unknown Error"         }
+};
+
+/*
+ *  Return string representation of plugin errno
+ */
+static const char * jobcomp_script_strerror (int errnum)
+{
+	struct jobcomp_errno *ep = errno_table;
+
+	while ((ep->n != errnum) && (ep->n != -1))
+		ep++;
+
+	return (ep->descr);
+}
+
+/*
+ *  Structure for holding job completion information for later
+ *   use by script;
+ */
+struct jobcomp_info {
+	uint32_t jobid;
+	uint32_t uid;
+	uint32_t limit;
+	uint32_t nprocs;
+	uint16_t batch_flag;
+	time_t submit;
+	time_t start;
+	time_t end;
+	char *nodes;
+	char *name;
+	char *partition;
+	char *jobstate;
+	char *account;
+};
+
+struct jobcomp_info * jobcomp_info_create (struct job_record *job)
+{
+	enum job_states state;
+	struct jobcomp_info * j = xmalloc (sizeof (*j));
+
+	j->jobid = job->job_id;
+	j->uid = job->user_id;
+	j->name = xstrdup (job->name);
+
+	/*
+	 *  Job will typically be COMPLETING when this code is called.
+	 *  We remove the COMPLETING flag to hopefully get the evenual
+	 *  completion state: e.g.: JOB_FAILED, TIMEOUT, ....
+	 */
+	state = job->job_state & (~JOB_COMPLETING);
+	j->jobstate = xstrdup (job_state_string (state));
+
+	j->partition = xstrdup (job->partition);
+	j->limit = job->time_limit;
+	j->start = job->start_time;
+	j->end = job->end_time;
+	j->submit = job->details ? job->details->submit_time:job->start_time;;
+	j->batch_flag = job->batch_flag;
+	j->nodes = xstrdup (job->nodes);
+	j->nprocs = job->num_procs;
+	j->account = job->account ? xstrdup (job->account) : NULL;
+
+	return (j);
+}
+
+void jobcomp_info_destroy (struct jobcomp_info *j)
+{
+	if (j == NULL)
+		return;
+	xfree (j->name);
+	xfree (j->partition);
+	xfree (j->nodes);
+	xfree (j->jobstate);
+	xfree (j->account);
+	xfree (j);
+}
+
 /*
  * Check if the script exists and if we can execute it.
  */
 static int 
-_check_script_permissions(char * location)
+_check_script_permissions(char * path)
 {
-	struct stat filestat;
-	uid_t user;
-	gid_t group;
+	struct stat st;
 
-	if(stat(location,&filestat) < 0) {
+	if (stat(path, &st) < 0) {
 		plugin_errno = errno;
-		return error("jobcomp/script script does not exist");
+		return error("jobcomp/script: %s does not exist", path);
 	}
 
-	if(!(filestat.st_mode&S_IFREG)) {
+	if (!(st.st_mode & S_IFREG)) {
 		plugin_errno = EACCES;
-		return error("jobcomp/script script isn't a regular file");
+		return error("jobcomp/script: %s isn't a regular file", path);
 	}
-	
-	user = geteuid();
-	group = getegid();
-	if(user == filestat.st_uid) {
-		if (filestat.st_mode&S_IXUSR) {
-			return SLURM_SUCCESS;
-		} else {
-			plugin_errno = EACCES;
-			return error("jobcomp/script script is not executable");
-		}
-	} else if(group == filestat.st_gid) {
-		if (filestat.st_mode&S_IXGRP) {
-			return SLURM_SUCCESS;
-		} else {
-			plugin_errno = EACCES;
-			return error("jobcomp/script script is not executable");
-		}
-	} else if(filestat.st_mode&S_IXOTH) {
-		return SLURM_SUCCESS;
+
+	if (access(path, X_OK) < 0) {
+		plugin_errno = EACCES;
+		return error("jobcomp/script: %s is not executable", path);
 	}
 
-	plugin_errno = EACCES;
-	return error("jobcomp/script script is not executable");
+	return SLURM_SUCCESS;
 }
 
-/* Write "name=value" to dest, returns pointer to after that key-pair string, 
- * where the next key-pair (if any) should be written. */
-static char *_add_env(char *dest, char *name, char *value)
+static char ** _extend_env (char ***envp)
 {
-	int len;
+	char **ep;
+	size_t newcnt = (xsize (*envp) / sizeof (char *)) + 1;
+
+	*envp = xrealloc (*envp, newcnt * sizeof (char *));
 
-	len = sprintf(dest, "%s=%s", name, value);
-	return dest + len + 1;
+	(*envp)[newcnt - 1] = NULL;
+	ep = &((*envp)[newcnt - 2]);
+
+	/* 
+	 *  Find last non-NULL entry
+	 */
+	while (*ep == NULL)
+		--ep;
+
+	return (++ep);
 }
 
-/* Create a new environment pointer containing information from 
- * slurm_jobcomp_log_record so that the script can access it.
- */
-static char ** _create_environment(char *job, char *user, char *job_name,
-	char *job_state, char *partition, char *limit, 
-	char* start, char * end, char * submit, 
-	char * batch, char *node_list, char *procs, char *account) 
+static int _env_append (char ***envp, const char *name, const char *val)
 {
-	int len = 0;
-	char ** envptr;
-
-	/* Increase ENV_COUNT if new environment variables are added */
-#define ENV_COUNT 15
-
-	/* Add strlen() of any sizable fields added to the array */
-	len = strlen(job_name) + strlen(partition) + strlen(node_list) +
-		strlen(account) + strlen(_PATH_STDPATH) + 1024;
-
-	if (!(envptr = (char **)try_xmalloc(len)))
-		return NULL;
-	envptr[0]  = (char *)envptr + (ENV_COUNT * sizeof(char *));
-
-/*	NEXT_DEST  =  ADD_ENV(DEST,        NAME=VALUE) */
-	envptr[1]  = _add_env(envptr[0],  "JOBID", job);
-	envptr[2]  = _add_env(envptr[1],  "UID", user);
-	envptr[3]  = _add_env(envptr[2],  "JOBNAME", job_name);
-	envptr[4]  = _add_env(envptr[3],  "JOBSTATE", job_state);
-	envptr[5]  = _add_env(envptr[4],  "PARTITION", partition);
-	envptr[6]  = _add_env(envptr[5],  "LIMIT", limit);
-	envptr[7]  = _add_env(envptr[6],  "START", start);
-	envptr[8]  = _add_env(envptr[7],  "END", end);
-	envptr[9]  = _add_env(envptr[8],  "NODES", node_list);
-	envptr[10] = _add_env(envptr[9],  "SUBMIT", submit);
-	envptr[11] = _add_env(envptr[10], "BATCH", batch);
-	envptr[12] = _add_env(envptr[11], "PROCS", procs);
-	envptr[13] = _add_env(envptr[12], "ACCOUNT", account);
+	char buf[4096];
+	char *entry;
+	char **ep;
+
+	if (val == NULL)
+		val = "";
+
+	snprintf (buf, sizeof (buf) - 1, "%s=%s", name, val);
+
+	if (!(entry = strdup (buf)))
+		return (-1);
+
+	ep = _extend_env (envp);
+	*ep = entry;
+
+	return (0);
+}
+
+
+static int _env_append_fmt (char ***envp, const char *name, 
+		const char *fmt, ...)
+{
+	char val[1024];
+	va_list ap;
+
+	va_start (ap, fmt);
+	vsnprintf (val, sizeof (val) - 1, fmt, ap);
+	va_end (ap);
+
+	return (_env_append (envp, name, val));
+}
+
+static char ** _create_environment (struct jobcomp_info *job)
+{
+	char **env;
+
+	env = xmalloc (1 * sizeof (*env));
+	env[0] = NULL;
+
+	_env_append_fmt (&env, "JOBID", "%u",  job->jobid);
+	_env_append_fmt (&env, "UID",   "%u",  job->uid);
+	_env_append_fmt (&env, "START", "%lu", job->start);
+	_env_append_fmt (&env, "END",   "%lu", job->end);
+	_env_append_fmt (&env, "SUBMIT","%lu", job->submit);
+	_env_append_fmt (&env, "PROCS", "%u", job->nprocs);
+
+	_env_append (&env, "BATCH", (job->batch_flag ? "yes" : "no"));
+	_env_append (&env, "NODES",     job->nodes);
+	_env_append (&env, "ACCOUNT",   job->account);
+	_env_append (&env, "JOBNAME",   job->name);
+	_env_append (&env, "JOBSTATE",  job->jobstate);
+	_env_append (&env, "PARTITION", job->partition);
+	
+	if (job->limit == INFINITE)
+		_env_append (&env, "LIMIT", "UNLIMITED");
+	else 
+		_env_append_fmt (&env, "LIMIT", "%u", job->limit);
 
 #ifdef _PATH_STDPATH
-	_add_env(envptr[13], "PATH", _PATH_STDPATH);
+	_env_append (&env, "PATH", _PATH_STDPATH);
 #else
-	envptr[13] = NULL;
+	_env_append (&env, "PATH", "/bin:/usr/bin");
 #endif
-	/* envptr[14] through envptr[ENV_COUNT] are NULL from xmalloc.
-	 * Since last entry must be NULL, add new records between 
-	 * "ACCOUNT" and "PATH" above. */
-	
-	return envptr;
+	return (env);
+}
+
+static int redirect_stdio (void)
+{
+	int devnull;
+	if ((devnull = open ("/dev/null", O_RDWR)) < 0)
+		return error ("jobcomp/script: Failed to open /dev/null: %m");
+	if (dup2 (devnull, STDIN_FILENO) < 0)
+		return error ("jobcomp/script: Failed to redirect stdin: %m");
+	if (dup2 (devnull, STDOUT_FILENO) < 0)
+		return error ("jobcomp/script: Failed to redirect stdout: %m");
+	if (dup2 (devnull, STDERR_FILENO) < 0)
+		return error ("jobcomp/script: Failed to redirect stderr: %m");
+	close (devnull);
+	return (0);
+}
+
+static void jobcomp_child (char * script, char **env)
+{
+	char * args[] = {script, NULL};
+	const char *tmpdir;
+
+#ifdef _PATH_TMP
+	tmpdir = _PATH_TMP;
+#else
+	tmpdir = "/tmp";
+#endif
+
+	/*
+	 * Reinitialize log so we can log any errors for 
+	 *  diagnosis
+	 */
+	log_reinit ();
+
+	if (redirect_stdio () < 0)
+		exit (1);
+
+	if (chdir (tmpdir) != 0) {
+		error ("jobcomp/script: chdir (%s): %m", _PATH_TMP);
+		exit(1);
+	}
+
+	execve(script, args, env);
+
+	/*
+	 * Failure of execve implies error
+	 */
+	error ("jobcomp/script: execve(%s): %m\n", script);
+	exit (1);
 }
 
+static int jobcomp_exec_child (char *script, char **env)
+{
+	pid_t pid;
+	int status = 0;
+
+	if (script == NULL || env == NULL)
+		return (-1);
+
+	if ((pid = fork()) < 0) {
+		error ("jobcomp/script: fork: %m");
+		return (-1);
+	}
+
+	if (pid == 0)
+		jobcomp_child (script, env);
+
+	/*
+	 *  Parent continues
+	 */
+
+	if (waitpid(pid, &status, 0) < 0)
+		error ("jobcomp/script: waitpid: %m");
+
+	if (WEXITSTATUS(status)) 
+		error ("jobcomp/script: script %s exited with status %d\n",
+		       script, WEXITSTATUS(status));
+
+	return (0);
+}
+
+
 /*
  * Thread function that executes a script
  */
-void *script_agent (void *args) {
-	pid_t pid = -1;
-	char user_id_str[32],job_id_str[32], nodes_cache[1];
-	char start_str[32], end_str[32], lim_str[32];
-	char submit_str[32], procs_str[32], account_str[128];
-	char * argvp[] = {script, NULL};
-	int status;
-	char ** envp, *nodes, *batch_str;
-	job_record job;
-
-	while(1) {
+void * script_agent (void *args) 
+{
+	info ("jobcomp_script_strerror (137) = %s\n", 
+			jobcomp_script_strerror (137));
+
+	while (1) {
+		struct jobcomp_info *job;
+
 		pthread_mutex_lock(&comp_list_mutex);
-		while ((list_is_empty(comp_list) != 0) && (agent_exit == 0)) {
-			pthread_cond_wait(&comp_list_cond, &comp_list_mutex);
-		}
-		if (agent_exit) {
-			pthread_mutex_unlock(&comp_list_mutex);
-			return NULL;
-		}
-		job = (job_record)list_pop(comp_list);
-		pthread_mutex_unlock(&comp_list_mutex);
 
-		snprintf(user_id_str,sizeof(user_id_str),"%u",job->user_id);
-		snprintf(job_id_str,sizeof(job_id_str),"%u",job->job_id);
-		snprintf(start_str, sizeof(start_str),"%lu",
-			(unsigned long)job->start);
-		snprintf(end_str, sizeof(end_str),"%lu",
-			(unsigned long)job->end);
-		snprintf(submit_str, sizeof(submit_str),"%lu",
-			(unsigned long)job->submit);
-		if (job->batch_flag)
-			batch_str = "yes";
-		else
-			batch_str = "no"; 
-		nodes_cache[0] = '\0';
-
-		if (job->limit == INFINITE) {
-			strcpy(lim_str, "UNLIMITED");
-		} else {
-			snprintf(lim_str, sizeof(lim_str), "%lu", 
-				(unsigned long) job->limit);
-		}
-	
-		if (job->node_list == NULL) {
-			nodes = nodes_cache;
-		} else {
-			nodes = job->node_list;
-		}
+		if (list_is_empty(comp_list) && !agent_exit)
+			pthread_cond_wait(&comp_list_cond, &comp_list_mutex);
 
-		snprintf(procs_str, sizeof(procs_str), "%lu",
-				(unsigned long) job->num_procs);
-		if (job->account) {
-			snprintf(account_str, sizeof(account_str), "%s",
-				job->account);
-		} else
-			account_str[0] = '\0';
-
-		/* Setup environment*/
-		envp = _create_environment(job_id_str,user_id_str,
-					job->job_name, job->job_state,
-					job->partition, lim_str,
-					start_str, end_str, submit_str,
-					batch_str, nodes, procs_str, 
-					account_str);
-
-		if(envp == NULL) {
-			plugin_errno = ENOMEM;
-		}
+		/*
+		 * It is safe to unlock list mutex here. List has its
+		 *  own internal mutex that protects the comp_list itself
+		 */
+		pthread_mutex_unlock(&comp_list_mutex);
 
-		pid = fork();
+		if ((job = list_pop(comp_list))) {
+			char **envp;
 
-		if (pid < 0) {
-			xfree(envp);
-			job_record_destroy((void *)job);
-			plugin_errno = errno;
-		} else if (pid == 0) {
-			/*Change directory to tmp*/
-			if (chdir(_PATH_TMP) != 0) {
-				exit(errno);
+			if ((envp = _create_environment (job)) == NULL) {
+				error ("jobcomp/script: create env failed!");
+				jobcomp_info_destroy (job);
+				continue;
 			}
 
-			/*Redirect stdin, stderr, and stdout to /dev/null*/
-			if (freopen(_PATH_DEVNULL, "rb", stdin) == NULL) {
-				exit(errno);
-			}
-			if (freopen(_PATH_DEVNULL, "wb", stdout) == NULL) {
-				exit(errno);
-			}
-			if (freopen(_PATH_DEVNULL, "wb", stderr) == NULL) {
-				exit(errno);
-			}
-		
-			/*Exec Script*/
-			execve(script,argvp,envp);
-		} else {
+			if (jobcomp_exec_child (script, envp) < 0)
+				error ("jobcomp/script: %s failed");
+
 			xfree(envp);
-			job_record_destroy((void *)job);
-			waitpid(pid,&status,0);
-			if(WIFEXITED(status) && WEXITSTATUS(status)) {
-				plugin_errno = WEXITSTATUS(status);
-			}
+			jobcomp_info_destroy (job);
 		}
+
+		/*
+		 *  Exit if flag is set and we have no more entries to log
+		 */
+		if (agent_exit && list_is_empty (comp_list))
+			break;
 	}
+
+	return NULL;
 }
 
 /*
  * init() is called when the plugin is loaded, before any other functions
  * are called.  Put global initialization here.
  */
-int init ( void )
+int init (void)
 {
 	pthread_attr_t attr;
 
@@ -334,18 +453,17 @@ int init ( void )
 
 	pthread_mutex_lock(&thread_flag_mutex);
 
-	comp_list = list_create((ListDelF) job_record_destroy);
-	if(comp_list == NULL) {
+	if (!(comp_list = list_create((ListDelF) jobcomp_info_destroy)))
 		return SLURM_ERROR;
-	}
 
 	if (script_thread) {
 		debug2( "Script thread already running, not starting another");
 		pthread_mutex_unlock(&thread_flag_mutex);
 		return SLURM_ERROR;
 	}
+
 	slurm_attr_init(&attr);
-	pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_DETACHED );
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
 	pthread_create(&script_thread, &attr, script_agent, NULL);
 	
 	pthread_mutex_unlock(&thread_flag_mutex);
@@ -355,16 +473,15 @@ int init ( void )
 }
 
 /* Set the location of the script to run*/
-int slurm_jobcomp_set_location ( char * location )
+int slurm_jobcomp_set_location (char * location)
 {
 	if (location == NULL) {
 		plugin_errno = EACCES;
 		return error("jobcomp/script JobCompLoc needs to be set");
 	}
 
-	if (_check_script_permissions(location) != SLURM_SUCCESS) {
+	if (_check_script_permissions(location) != SLURM_SUCCESS)
 		return SLURM_ERROR;
-	}
 
 	xfree(script);
 	script = xstrdup(location);
@@ -372,47 +489,34 @@ int slurm_jobcomp_set_location ( char * location )
 	return SLURM_SUCCESS;
 }
 
-int slurm_jobcomp_log_record ( struct job_record *job_ptr )
+int slurm_jobcomp_log_record (struct job_record *record)
 {
-	job_record job;
-	enum job_states job_state;
-	time_t submit;
+	struct jobcomp_info * job;
 
 	debug3("Entering slurm_jobcomp_log_record");
-	/* Job will typically be COMPLETING when this is called. 
-	 * We remove this flag to get the eventual completion state:
-	 * JOB_FAILED, JOB_TIMEOUT, etc. */
-	job_state = job_ptr->job_state & (~JOB_COMPLETING);
-
-	if (job_ptr->details)
-		submit = job_ptr->details->submit_time;
-	else
-		submit = job_ptr->start_time;
-
-	job = job_record_create(job_ptr->job_id, job_ptr->user_id, job_ptr->name,
-				job_state_string(job_state), 
-				job_ptr->partition, job_ptr->time_limit,
-				job_ptr->start_time, job_ptr->end_time,
-				submit, job_ptr->batch_flag, job_ptr->nodes,
-				job_ptr->num_procs, job_ptr->account);
+
+	if ((job = jobcomp_info_create (record))) 
+		return error ("jobcomp/script: Failed to create job info!");
+
 	pthread_mutex_lock(&comp_list_mutex);
-	list_append(comp_list,(void *)job);
+	list_append(comp_list, job);
 	pthread_mutex_unlock(&comp_list_mutex);
+
 	pthread_cond_broadcast(&comp_list_cond);
+
 	return SLURM_SUCCESS;
 }
 
 /* Return the error code of the plugin*/
-int slurm_jobcomp_get_errno( void )
+int slurm_jobcomp_get_errno(void)
 {
 	return plugin_errno;
 }
 
 /* Return a string representation of the error */
-char *slurm_jobcomp_strerror( int errnum )
+const char * slurm_jobcomp_strerror(int errnum)
 {
-	strerror_r(errnum,error_str,sizeof(error_str));
-	return error_str;
+	return jobcomp_script_strerror (errnum);
 }
 
 static int _wait_for_thread (pthread_t thread_id)
@@ -424,7 +528,13 @@ static int _wait_for_thread (pthread_t thread_id)
 			return SLURM_SUCCESS;
 		usleep(1000);
 	}
-	error("Could not kill jobcomp script pthread");
+
+	/*
+	 * jobcomp thread appears to be stuck, try harder:
+	 */
+	if (pthread_kill(thread_id, SIGKILL))
+		error("Could not kill jobcomp script pthread");
+
 	return SLURM_ERROR;
 }
 
-- 
GitLab