diff --git a/NEWS b/NEWS index a9f4cb2645b4b4c6abbb5468527ffb65b1500f48..3546b3bffccea710750a058b43bae350e7c3e7f7 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - code to make it so you can make a 36x36x36 system. The wiring isn't correct on the X axis for anything over 8 though. But that is only in emulation mode. + -- Major re-write of jobcomp/script plugin: fix memory leak and + general code clean-up. * Changes in SLURM 1.2.6 ======================== diff --git a/src/plugins/jobcomp/script/Makefile.am b/src/plugins/jobcomp/script/Makefile.am index 9ad7aa904708ec2b17f0b1de29814003e6a3aefd..eae162b656f727d3a4a4fa1d27e16ebcfc37e1a2 100644 --- a/src/plugins/jobcomp/script/Makefile.am +++ b/src/plugins/jobcomp/script/Makefile.am @@ -9,5 +9,5 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = jobcomp_script.la # Text file job completion logging plugin. -jobcomp_script_la_SOURCES = job_record.c job_record.h jobcomp_script.c +jobcomp_script_la_SOURCES = jobcomp_script.c jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/jobcomp/script/Makefile.in b/src/plugins/jobcomp/script/Makefile.in index 871dba84763a4f6b4b1f7258d8705bf0e92c9b56..0a9a4cfbab673af6a67cfc3b4ef906892b8ae5e4 100644 --- a/src/plugins/jobcomp/script/Makefile.in +++ b/src/plugins/jobcomp/script/Makefile.in @@ -78,7 +78,7 @@ am__installdirs = "$(DESTDIR)$(pkglibdir)" pkglibLTLIBRARIES_INSTALL = $(INSTALL) LTLIBRARIES = $(pkglib_LTLIBRARIES) jobcomp_script_la_LIBADD = -am_jobcomp_script_la_OBJECTS = job_record.lo jobcomp_script.lo +am_jobcomp_script_la_OBJECTS = jobcomp_script.lo jobcomp_script_la_OBJECTS = $(am_jobcomp_script_la_OBJECTS) DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) -I$(top_builddir)/slurm depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp @@ -285,7 +285,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = jobcomp_script.la # Text file job completion logging plugin. -jobcomp_script_la_SOURCES = job_record.c job_record.h jobcomp_script.c +jobcomp_script_la_SOURCES = jobcomp_script.c jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) all: all-am @@ -356,7 +356,6 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_record.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jobcomp_script.Plo@am__quote@ .c.o: diff --git a/src/plugins/jobcomp/script/job_record.c b/src/plugins/jobcomp/script/job_record.c deleted file mode 100644 index 32d987c8f2fe6cb06245849154df5d724ea3a23e..0000000000000000000000000000000000000000 --- a/src/plugins/jobcomp/script/job_record.c +++ /dev/null @@ -1,102 +0,0 @@ -/****************************************************************************** - * job_record.c - Functions to alter script plugin job record structure. - ****************************************************************************** - * Produced at Center for High Performance Computing, North Dakota State - * University - * Written by Nathan Huff <nhuff@acm.org> - * UCRL-CODE-226842. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -\*****************************************************************************/ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDINT_H -# include <stdint.h> -#endif -#if HAVE_INTTYPES_H -# include <inttypes.h> -#endif - - -#include <unistd.h> -#include <sys/types.h> -#include <slurm/slurm.h> -#include "job_record.h" -#include "src/common/xmalloc.h" -#include "src/common/xstring.h" - -/* - * Create a new job_record containing the job completion information - */ -job_record job_record_create(uint32_t job_id, uint32_t user_id, char *job_name, - char *job_state, char *partition, - uint32_t limit, time_t start, time_t end, time_t submit, - uint16_t batch_flag, char *node_list, uint32_t num_procs, - char *account) -{ - job_record ret; - - ret = xmalloc(sizeof(struct job_record_)); - ret->job_id = job_id; - ret->user_id = user_id; - ret->job_name = xstrdup(job_name); - ret->job_state = xstrdup(job_state); - ret->partition = xstrdup(partition); - ret->limit = limit; - ret->start = start; - ret->submit = submit; - ret->batch_flag = batch_flag; - ret->end = end; - ret->node_list = xstrdup(node_list); - ret->num_procs = num_procs; - if (account) - ret->account = xstrdup(account); - - return ret; -} - -/* - * Free the memory from a job_record structure - */ -void job_record_destroy(void *job) { - job_record j = job; - - if (j == NULL) - return; - - xfree(j->job_name); - xfree(j->partition); - xfree(j->node_list); - xfree(j->job_state); - xfree(j->account); - xfree(j); -} diff --git a/src/plugins/jobcomp/script/job_record.h b/src/plugins/jobcomp/script/job_record.h deleted file mode 100644 index cfd762739e5acb0bd11a910f1b5875dcbcb4d416..0000000000000000000000000000000000000000 --- a/src/plugins/jobcomp/script/job_record.h +++ /dev/null @@ -1,70 +0,0 @@ -/*****************************************************************************\ - * job_record.h - Header for job_record structure. - ***************************************************************************** - * Produced at Center for High Performance Computing, North Dakota State - * University - * Written by Nathan Huff <nhuff@geekshanty.com> - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -\*****************************************************************************/ - - -#ifndef _JOB_RECORD_H -#define _JOB_RECORD_H - -/* Structure to contain job completion information */ -struct job_record_ { - uint32_t job_id; - uint32_t user_id; - char *job_name; - char *job_state; - char *partition; - uint32_t limit; - uint16_t batch_flag; - uint32_t num_procs; - time_t submit; - time_t start; - time_t end; - char *node_list; - char *account; -}; - -typedef struct job_record_ * job_record; - -/* Create a job record */ -job_record job_record_create(uint32_t job_id, uint32_t user_id, char *job_name, - char *job_state, char *partition, - uint32_t limit, time_t start, time_t end, time_t submit, - uint16_t batch_flag, char *node_list, uint32_t num_procs, - char *account); - -/* Destroy a job record */ -void job_record_destroy(void *j); - -#endif /* _JOB_RECORD_H */ diff --git a/src/plugins/jobcomp/script/jobcomp_script.c b/src/plugins/jobcomp/script/jobcomp_script.c index 1afb8215da9789e52bbaa84578dd98841d69b9f5..d2ec536352780c91e13fe8fae9689488185ad5eb 100644 --- a/src/plugins/jobcomp/script/jobcomp_script.c +++ b/src/plugins/jobcomp/script/jobcomp_script.c @@ -47,12 +47,13 @@ # include <inttypes.h> #endif -#include <stdio.h> #include <stdlib.h> #include <string.h> +#include <stdarg.h> #include <paths.h> #include <sys/types.h> #include <sys/stat.h> +#include <fcntl.h> #include <sys/wait.h> #include <unistd.h> #include <pthread.h> @@ -65,7 +66,6 @@ #include "src/common/xstring.h" #include "src/common/list.h" #include "src/slurmctld/slurmctld.h" -#include "job_record.h" /* * These variables are required by the generic plugin interface. If they @@ -100,9 +100,7 @@ const char plugin_name[] = "Job completion logging script plugin"; const char plugin_type[] = "jobcomp/script"; const uint32_t plugin_version = 90; -static int plugin_errno = SLURM_SUCCESS; static char * script = NULL; -static char error_str[256]; static List comp_list = NULL; static pthread_t script_thread = 0; @@ -111,222 +109,343 @@ static pthread_mutex_t comp_list_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t comp_list_cond = PTHREAD_COND_INITIALIZER; static int agent_exit = 0; +/* + * Local plugin errno + */ +static int plugin_errno = SLURM_SUCCESS; + +static struct jobcomp_errno { + int n; + const char *descr; +} errno_table [] = { + { 0, "No Error" }, + { EACCES, "Script access denied" }, + { EEXIST, "Script does not exist" }, + { EINVAL, "JocCompLoc invalid" }, + { -1, "Unknown Error" } +}; + +/* + * Return string representation of plugin errno + */ +static const char * jobcomp_script_strerror (int errnum) +{ + struct jobcomp_errno *ep = errno_table; + + while ((ep->n != errnum) && (ep->n != -1)) + ep++; + + return (ep->descr); +} + +/* + * Structure for holding job completion information for later + * use by script; + */ +struct jobcomp_info { + uint32_t jobid; + uint32_t uid; + uint32_t limit; + uint32_t nprocs; + uint16_t batch_flag; + time_t submit; + time_t start; + time_t end; + char *nodes; + char *name; + char *partition; + char *jobstate; + char *account; +}; + +struct jobcomp_info * jobcomp_info_create (struct job_record *job) +{ + enum job_states state; + struct jobcomp_info * j = xmalloc (sizeof (*j)); + + j->jobid = job->job_id; + j->uid = job->user_id; + j->name = xstrdup (job->name); + + /* + * Job will typically be COMPLETING when this code is called. + * We remove the COMPLETING flag to hopefully get the evenual + * completion state: e.g.: JOB_FAILED, TIMEOUT, .... + */ + state = job->job_state & (~JOB_COMPLETING); + j->jobstate = xstrdup (job_state_string (state)); + + j->partition = xstrdup (job->partition); + j->limit = job->time_limit; + j->start = job->start_time; + j->end = job->end_time; + j->submit = job->details ? job->details->submit_time:job->start_time;; + j->batch_flag = job->batch_flag; + j->nodes = xstrdup (job->nodes); + j->nprocs = job->num_procs; + j->account = job->account ? xstrdup (job->account) : NULL; + + return (j); +} + +void jobcomp_info_destroy (struct jobcomp_info *j) +{ + if (j == NULL) + return; + xfree (j->name); + xfree (j->partition); + xfree (j->nodes); + xfree (j->jobstate); + xfree (j->account); + xfree (j); +} + /* * Check if the script exists and if we can execute it. */ static int -_check_script_permissions(char * location) +_check_script_permissions(char * path) { - struct stat filestat; - uid_t user; - gid_t group; + struct stat st; - if(stat(location,&filestat) < 0) { + if (stat(path, &st) < 0) { plugin_errno = errno; - return error("jobcomp/script script does not exist"); + return error("jobcomp/script: %s does not exist", path); } - if(!(filestat.st_mode&S_IFREG)) { + if (!(st.st_mode & S_IFREG)) { plugin_errno = EACCES; - return error("jobcomp/script script isn't a regular file"); + return error("jobcomp/script: %s isn't a regular file", path); } - - user = geteuid(); - group = getegid(); - if(user == filestat.st_uid) { - if (filestat.st_mode&S_IXUSR) { - return SLURM_SUCCESS; - } else { - plugin_errno = EACCES; - return error("jobcomp/script script is not executable"); - } - } else if(group == filestat.st_gid) { - if (filestat.st_mode&S_IXGRP) { - return SLURM_SUCCESS; - } else { - plugin_errno = EACCES; - return error("jobcomp/script script is not executable"); - } - } else if(filestat.st_mode&S_IXOTH) { - return SLURM_SUCCESS; + + if (access(path, X_OK) < 0) { + plugin_errno = EACCES; + return error("jobcomp/script: %s is not executable", path); } - plugin_errno = EACCES; - return error("jobcomp/script script is not executable"); + return SLURM_SUCCESS; } -/* Write "name=value" to dest, returns pointer to after that key-pair string, - * where the next key-pair (if any) should be written. */ -static char *_add_env(char *dest, char *name, char *value) +static char ** _extend_env (char ***envp) { - int len; + char **ep; + size_t newcnt = (xsize (*envp) / sizeof (char *)) + 1; + + *envp = xrealloc (*envp, newcnt * sizeof (char *)); - len = sprintf(dest, "%s=%s", name, value); - return dest + len + 1; + (*envp)[newcnt - 1] = NULL; + ep = &((*envp)[newcnt - 2]); + + /* + * Find last non-NULL entry + */ + while (*ep == NULL) + --ep; + + return (++ep); } -/* Create a new environment pointer containing information from - * slurm_jobcomp_log_record so that the script can access it. - */ -static char ** _create_environment(char *job, char *user, char *job_name, - char *job_state, char *partition, char *limit, - char* start, char * end, char * submit, - char * batch, char *node_list, char *procs, char *account) +static int _env_append (char ***envp, const char *name, const char *val) { - int len = 0; - char ** envptr; - - /* Increase ENV_COUNT if new environment variables are added */ -#define ENV_COUNT 15 - - /* Add strlen() of any sizable fields added to the array */ - len = strlen(job_name) + strlen(partition) + strlen(node_list) + - strlen(account) + strlen(_PATH_STDPATH) + 1024; - - if (!(envptr = (char **)try_xmalloc(len))) - return NULL; - envptr[0] = (char *)envptr + (ENV_COUNT * sizeof(char *)); - -/* NEXT_DEST = ADD_ENV(DEST, NAME=VALUE) */ - envptr[1] = _add_env(envptr[0], "JOBID", job); - envptr[2] = _add_env(envptr[1], "UID", user); - envptr[3] = _add_env(envptr[2], "JOBNAME", job_name); - envptr[4] = _add_env(envptr[3], "JOBSTATE", job_state); - envptr[5] = _add_env(envptr[4], "PARTITION", partition); - envptr[6] = _add_env(envptr[5], "LIMIT", limit); - envptr[7] = _add_env(envptr[6], "START", start); - envptr[8] = _add_env(envptr[7], "END", end); - envptr[9] = _add_env(envptr[8], "NODES", node_list); - envptr[10] = _add_env(envptr[9], "SUBMIT", submit); - envptr[11] = _add_env(envptr[10], "BATCH", batch); - envptr[12] = _add_env(envptr[11], "PROCS", procs); - envptr[13] = _add_env(envptr[12], "ACCOUNT", account); + char buf[4096]; + char *entry; + char **ep; + + if (val == NULL) + val = ""; + + snprintf (buf, sizeof (buf) - 1, "%s=%s", name, val); + + if (!(entry = strdup (buf))) + return (-1); + + ep = _extend_env (envp); + *ep = entry; + + return (0); +} + + +static int _env_append_fmt (char ***envp, const char *name, + const char *fmt, ...) +{ + char val[1024]; + va_list ap; + + va_start (ap, fmt); + vsnprintf (val, sizeof (val) - 1, fmt, ap); + va_end (ap); + + return (_env_append (envp, name, val)); +} + +static char ** _create_environment (struct jobcomp_info *job) +{ + char **env; + + env = xmalloc (1 * sizeof (*env)); + env[0] = NULL; + + _env_append_fmt (&env, "JOBID", "%u", job->jobid); + _env_append_fmt (&env, "UID", "%u", job->uid); + _env_append_fmt (&env, "START", "%lu", job->start); + _env_append_fmt (&env, "END", "%lu", job->end); + _env_append_fmt (&env, "SUBMIT","%lu", job->submit); + _env_append_fmt (&env, "PROCS", "%u", job->nprocs); + + _env_append (&env, "BATCH", (job->batch_flag ? "yes" : "no")); + _env_append (&env, "NODES", job->nodes); + _env_append (&env, "ACCOUNT", job->account); + _env_append (&env, "JOBNAME", job->name); + _env_append (&env, "JOBSTATE", job->jobstate); + _env_append (&env, "PARTITION", job->partition); + + if (job->limit == INFINITE) + _env_append (&env, "LIMIT", "UNLIMITED"); + else + _env_append_fmt (&env, "LIMIT", "%u", job->limit); #ifdef _PATH_STDPATH - _add_env(envptr[13], "PATH", _PATH_STDPATH); + _env_append (&env, "PATH", _PATH_STDPATH); #else - envptr[13] = NULL; + _env_append (&env, "PATH", "/bin:/usr/bin"); #endif - /* envptr[14] through envptr[ENV_COUNT] are NULL from xmalloc. - * Since last entry must be NULL, add new records between - * "ACCOUNT" and "PATH" above. */ - - return envptr; + return (env); +} + +static int redirect_stdio (void) +{ + int devnull; + if ((devnull = open ("/dev/null", O_RDWR)) < 0) + return error ("jobcomp/script: Failed to open /dev/null: %m"); + if (dup2 (devnull, STDIN_FILENO) < 0) + return error ("jobcomp/script: Failed to redirect stdin: %m"); + if (dup2 (devnull, STDOUT_FILENO) < 0) + return error ("jobcomp/script: Failed to redirect stdout: %m"); + if (dup2 (devnull, STDERR_FILENO) < 0) + return error ("jobcomp/script: Failed to redirect stderr: %m"); + close (devnull); + return (0); +} + +static void jobcomp_child (char * script, char **env) +{ + char * args[] = {script, NULL}; + const char *tmpdir; + +#ifdef _PATH_TMP + tmpdir = _PATH_TMP; +#else + tmpdir = "/tmp"; +#endif + + /* + * Reinitialize log so we can log any errors for + * diagnosis + */ + log_reinit (); + + if (redirect_stdio () < 0) + exit (1); + + if (chdir (tmpdir) != 0) { + error ("jobcomp/script: chdir (%s): %m", _PATH_TMP); + exit(1); + } + + execve(script, args, env); + + /* + * Failure of execve implies error + */ + error ("jobcomp/script: execve(%s): %m\n", script); + exit (1); } +static int jobcomp_exec_child (char *script, char **env) +{ + pid_t pid; + int status = 0; + + if (script == NULL || env == NULL) + return (-1); + + if ((pid = fork()) < 0) { + error ("jobcomp/script: fork: %m"); + return (-1); + } + + if (pid == 0) + jobcomp_child (script, env); + + /* + * Parent continues + */ + + if (waitpid(pid, &status, 0) < 0) + error ("jobcomp/script: waitpid: %m"); + + if (WEXITSTATUS(status)) + error ("jobcomp/script: script %s exited with status %d\n", + script, WEXITSTATUS(status)); + + return (0); +} + + /* * Thread function that executes a script */ -void *script_agent (void *args) { - pid_t pid = -1; - char user_id_str[32],job_id_str[32], nodes_cache[1]; - char start_str[32], end_str[32], lim_str[32]; - char submit_str[32], procs_str[32], account_str[128]; - char * argvp[] = {script, NULL}; - int status; - char ** envp, *nodes, *batch_str; - job_record job; - - while(1) { +void * script_agent (void *args) +{ + info ("jobcomp_script_strerror (137) = %s\n", + jobcomp_script_strerror (137)); + + while (1) { + struct jobcomp_info *job; + pthread_mutex_lock(&comp_list_mutex); - while ((list_is_empty(comp_list) != 0) && (agent_exit == 0)) { - pthread_cond_wait(&comp_list_cond, &comp_list_mutex); - } - if (agent_exit) { - pthread_mutex_unlock(&comp_list_mutex); - return NULL; - } - job = (job_record)list_pop(comp_list); - pthread_mutex_unlock(&comp_list_mutex); - snprintf(user_id_str,sizeof(user_id_str),"%u",job->user_id); - snprintf(job_id_str,sizeof(job_id_str),"%u",job->job_id); - snprintf(start_str, sizeof(start_str),"%lu", - (unsigned long)job->start); - snprintf(end_str, sizeof(end_str),"%lu", - (unsigned long)job->end); - snprintf(submit_str, sizeof(submit_str),"%lu", - (unsigned long)job->submit); - if (job->batch_flag) - batch_str = "yes"; - else - batch_str = "no"; - nodes_cache[0] = '\0'; - - if (job->limit == INFINITE) { - strcpy(lim_str, "UNLIMITED"); - } else { - snprintf(lim_str, sizeof(lim_str), "%lu", - (unsigned long) job->limit); - } - - if (job->node_list == NULL) { - nodes = nodes_cache; - } else { - nodes = job->node_list; - } + if (list_is_empty(comp_list) && !agent_exit) + pthread_cond_wait(&comp_list_cond, &comp_list_mutex); - snprintf(procs_str, sizeof(procs_str), "%lu", - (unsigned long) job->num_procs); - if (job->account) { - snprintf(account_str, sizeof(account_str), "%s", - job->account); - } else - account_str[0] = '\0'; - - /* Setup environment*/ - envp = _create_environment(job_id_str,user_id_str, - job->job_name, job->job_state, - job->partition, lim_str, - start_str, end_str, submit_str, - batch_str, nodes, procs_str, - account_str); - - if(envp == NULL) { - plugin_errno = ENOMEM; - } + /* + * It is safe to unlock list mutex here. List has its + * own internal mutex that protects the comp_list itself + */ + pthread_mutex_unlock(&comp_list_mutex); - pid = fork(); + if ((job = list_pop(comp_list))) { + char **envp; - if (pid < 0) { - xfree(envp); - job_record_destroy((void *)job); - plugin_errno = errno; - } else if (pid == 0) { - /*Change directory to tmp*/ - if (chdir(_PATH_TMP) != 0) { - exit(errno); + if ((envp = _create_environment (job)) == NULL) { + error ("jobcomp/script: create env failed!"); + jobcomp_info_destroy (job); + continue; } - /*Redirect stdin, stderr, and stdout to /dev/null*/ - if (freopen(_PATH_DEVNULL, "rb", stdin) == NULL) { - exit(errno); - } - if (freopen(_PATH_DEVNULL, "wb", stdout) == NULL) { - exit(errno); - } - if (freopen(_PATH_DEVNULL, "wb", stderr) == NULL) { - exit(errno); - } - - /*Exec Script*/ - execve(script,argvp,envp); - } else { + if (jobcomp_exec_child (script, envp) < 0) + error ("jobcomp/script: %s failed"); + xfree(envp); - job_record_destroy((void *)job); - waitpid(pid,&status,0); - if(WIFEXITED(status) && WEXITSTATUS(status)) { - plugin_errno = WEXITSTATUS(status); - } + jobcomp_info_destroy (job); } + + /* + * Exit if flag is set and we have no more entries to log + */ + if (agent_exit && list_is_empty (comp_list)) + break; } + + return NULL; } /* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ -int init ( void ) +int init (void) { pthread_attr_t attr; @@ -334,18 +453,17 @@ int init ( void ) pthread_mutex_lock(&thread_flag_mutex); - comp_list = list_create((ListDelF) job_record_destroy); - if(comp_list == NULL) { + if (!(comp_list = list_create((ListDelF) jobcomp_info_destroy))) return SLURM_ERROR; - } if (script_thread) { debug2( "Script thread already running, not starting another"); pthread_mutex_unlock(&thread_flag_mutex); return SLURM_ERROR; } + slurm_attr_init(&attr); - pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_DETACHED ); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); pthread_create(&script_thread, &attr, script_agent, NULL); pthread_mutex_unlock(&thread_flag_mutex); @@ -355,16 +473,15 @@ int init ( void ) } /* Set the location of the script to run*/ -int slurm_jobcomp_set_location ( char * location ) +int slurm_jobcomp_set_location (char * location) { if (location == NULL) { plugin_errno = EACCES; return error("jobcomp/script JobCompLoc needs to be set"); } - if (_check_script_permissions(location) != SLURM_SUCCESS) { + if (_check_script_permissions(location) != SLURM_SUCCESS) return SLURM_ERROR; - } xfree(script); script = xstrdup(location); @@ -372,47 +489,34 @@ int slurm_jobcomp_set_location ( char * location ) return SLURM_SUCCESS; } -int slurm_jobcomp_log_record ( struct job_record *job_ptr ) +int slurm_jobcomp_log_record (struct job_record *record) { - job_record job; - enum job_states job_state; - time_t submit; + struct jobcomp_info * job; debug3("Entering slurm_jobcomp_log_record"); - /* Job will typically be COMPLETING when this is called. - * We remove this flag to get the eventual completion state: - * JOB_FAILED, JOB_TIMEOUT, etc. */ - job_state = job_ptr->job_state & (~JOB_COMPLETING); - - if (job_ptr->details) - submit = job_ptr->details->submit_time; - else - submit = job_ptr->start_time; - - job = job_record_create(job_ptr->job_id, job_ptr->user_id, job_ptr->name, - job_state_string(job_state), - job_ptr->partition, job_ptr->time_limit, - job_ptr->start_time, job_ptr->end_time, - submit, job_ptr->batch_flag, job_ptr->nodes, - job_ptr->num_procs, job_ptr->account); + + if ((job = jobcomp_info_create (record))) + return error ("jobcomp/script: Failed to create job info!"); + pthread_mutex_lock(&comp_list_mutex); - list_append(comp_list,(void *)job); + list_append(comp_list, job); pthread_mutex_unlock(&comp_list_mutex); + pthread_cond_broadcast(&comp_list_cond); + return SLURM_SUCCESS; } /* Return the error code of the plugin*/ -int slurm_jobcomp_get_errno( void ) +int slurm_jobcomp_get_errno(void) { return plugin_errno; } /* Return a string representation of the error */ -char *slurm_jobcomp_strerror( int errnum ) +const char * slurm_jobcomp_strerror(int errnum) { - strerror_r(errnum,error_str,sizeof(error_str)); - return error_str; + return jobcomp_script_strerror (errnum); } static int _wait_for_thread (pthread_t thread_id) @@ -424,7 +528,13 @@ static int _wait_for_thread (pthread_t thread_id) return SLURM_SUCCESS; usleep(1000); } - error("Could not kill jobcomp script pthread"); + + /* + * jobcomp thread appears to be stuck, try harder: + */ + if (pthread_kill(thread_id, SIGKILL)) + error("Could not kill jobcomp script pthread"); + return SLURM_ERROR; }