Skip to content
Snippets Groups Projects
Commit 40cd97d6 authored by Moe Jette's avatar Moe Jette
Browse files

Major re-write of jobcomp/script plugin: fix memory leak and

    general code clean-up - Mark Grondona.
parent 790cf6b9
No related branches found
No related tags found
No related merge requests found
...@@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. ...@@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins.
-- BLUEGENE - code to make it so you can make a 36x36x36 system. -- BLUEGENE - code to make it so you can make a 36x36x36 system.
The wiring isn't correct on the X axis for anything over 8 though. The wiring isn't correct on the X axis for anything over 8 though.
But that is only in emulation mode. But that is only in emulation mode.
-- Major re-write of jobcomp/script plugin: fix memory leak and
general code clean-up.
* Changes in SLURM 1.2.6 * Changes in SLURM 1.2.6
======================== ========================
......
...@@ -9,5 +9,5 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common ...@@ -9,5 +9,5 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
pkglib_LTLIBRARIES = jobcomp_script.la pkglib_LTLIBRARIES = jobcomp_script.la
# Text file job completion logging plugin. # Text file job completion logging plugin.
jobcomp_script_la_SOURCES = job_record.c job_record.h jobcomp_script.c jobcomp_script_la_SOURCES = jobcomp_script.c
jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
...@@ -78,7 +78,7 @@ am__installdirs = "$(DESTDIR)$(pkglibdir)" ...@@ -78,7 +78,7 @@ am__installdirs = "$(DESTDIR)$(pkglibdir)"
pkglibLTLIBRARIES_INSTALL = $(INSTALL) pkglibLTLIBRARIES_INSTALL = $(INSTALL)
LTLIBRARIES = $(pkglib_LTLIBRARIES) LTLIBRARIES = $(pkglib_LTLIBRARIES)
jobcomp_script_la_LIBADD = jobcomp_script_la_LIBADD =
am_jobcomp_script_la_OBJECTS = job_record.lo jobcomp_script.lo am_jobcomp_script_la_OBJECTS = jobcomp_script.lo
jobcomp_script_la_OBJECTS = $(am_jobcomp_script_la_OBJECTS) jobcomp_script_la_OBJECTS = $(am_jobcomp_script_la_OBJECTS)
DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) -I$(top_builddir)/slurm DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) -I$(top_builddir)/slurm
depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp
...@@ -285,7 +285,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common ...@@ -285,7 +285,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
pkglib_LTLIBRARIES = jobcomp_script.la pkglib_LTLIBRARIES = jobcomp_script.la
# Text file job completion logging plugin. # Text file job completion logging plugin.
jobcomp_script_la_SOURCES = job_record.c job_record.h jobcomp_script.c jobcomp_script_la_SOURCES = jobcomp_script.c
jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) jobcomp_script_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
all: all-am all: all-am
...@@ -356,7 +356,6 @@ mostlyclean-compile: ...@@ -356,7 +356,6 @@ mostlyclean-compile:
distclean-compile: distclean-compile:
-rm -f *.tab.c -rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_record.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jobcomp_script.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jobcomp_script.Plo@am__quote@
.c.o: .c.o:
......
/******************************************************************************
* job_record.c - Functions to alter script plugin job record structure.
******************************************************************************
* Produced at Center for High Performance Computing, North Dakota State
* University
* Written by Nathan Huff <nhuff@acm.org>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#if HAVE_STDINT_H
# include <stdint.h>
#endif
#if HAVE_INTTYPES_H
# include <inttypes.h>
#endif
#include <unistd.h>
#include <sys/types.h>
#include <slurm/slurm.h>
#include "job_record.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
/*
* Create a new job_record containing the job completion information
*/
job_record job_record_create(uint32_t job_id, uint32_t user_id, char *job_name,
char *job_state, char *partition,
uint32_t limit, time_t start, time_t end, time_t submit,
uint16_t batch_flag, char *node_list, uint32_t num_procs,
char *account)
{
job_record ret;
ret = xmalloc(sizeof(struct job_record_));
ret->job_id = job_id;
ret->user_id = user_id;
ret->job_name = xstrdup(job_name);
ret->job_state = xstrdup(job_state);
ret->partition = xstrdup(partition);
ret->limit = limit;
ret->start = start;
ret->submit = submit;
ret->batch_flag = batch_flag;
ret->end = end;
ret->node_list = xstrdup(node_list);
ret->num_procs = num_procs;
if (account)
ret->account = xstrdup(account);
return ret;
}
/*
* Free the memory from a job_record structure
*/
void job_record_destroy(void *job) {
job_record j = job;
if (j == NULL)
return;
xfree(j->job_name);
xfree(j->partition);
xfree(j->node_list);
xfree(j->job_state);
xfree(j->account);
xfree(j);
}
/*****************************************************************************\
* job_record.h - Header for job_record structure.
*****************************************************************************
* Produced at Center for High Performance Computing, North Dakota State
* University
* Written by Nathan Huff <nhuff@geekshanty.com>
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _JOB_RECORD_H
#define _JOB_RECORD_H
/* Structure to contain job completion information */
struct job_record_ {
uint32_t job_id;
uint32_t user_id;
char *job_name;
char *job_state;
char *partition;
uint32_t limit;
uint16_t batch_flag;
uint32_t num_procs;
time_t submit;
time_t start;
time_t end;
char *node_list;
char *account;
};
typedef struct job_record_ * job_record;
/* Create a job record */
job_record job_record_create(uint32_t job_id, uint32_t user_id, char *job_name,
char *job_state, char *partition,
uint32_t limit, time_t start, time_t end, time_t submit,
uint16_t batch_flag, char *node_list, uint32_t num_procs,
char *account);
/* Destroy a job record */
void job_record_destroy(void *j);
#endif /* _JOB_RECORD_H */
...@@ -47,12 +47,13 @@ ...@@ -47,12 +47,13 @@
# include <inttypes.h> # include <inttypes.h>
#endif #endif
#include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <stdarg.h>
#include <paths.h> #include <paths.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <fcntl.h>
#include <sys/wait.h> #include <sys/wait.h>
#include <unistd.h> #include <unistd.h>
#include <pthread.h> #include <pthread.h>
...@@ -65,7 +66,6 @@ ...@@ -65,7 +66,6 @@
#include "src/common/xstring.h" #include "src/common/xstring.h"
#include "src/common/list.h" #include "src/common/list.h"
#include "src/slurmctld/slurmctld.h" #include "src/slurmctld/slurmctld.h"
#include "job_record.h"
/* /*
* These variables are required by the generic plugin interface. If they * These variables are required by the generic plugin interface. If they
...@@ -100,9 +100,7 @@ const char plugin_name[] = "Job completion logging script plugin"; ...@@ -100,9 +100,7 @@ const char plugin_name[] = "Job completion logging script plugin";
const char plugin_type[] = "jobcomp/script"; const char plugin_type[] = "jobcomp/script";
const uint32_t plugin_version = 90; const uint32_t plugin_version = 90;
static int plugin_errno = SLURM_SUCCESS;
static char * script = NULL; static char * script = NULL;
static char error_str[256];
static List comp_list = NULL; static List comp_list = NULL;
static pthread_t script_thread = 0; static pthread_t script_thread = 0;
...@@ -111,222 +109,343 @@ static pthread_mutex_t comp_list_mutex = PTHREAD_MUTEX_INITIALIZER; ...@@ -111,222 +109,343 @@ static pthread_mutex_t comp_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t comp_list_cond = PTHREAD_COND_INITIALIZER; static pthread_cond_t comp_list_cond = PTHREAD_COND_INITIALIZER;
static int agent_exit = 0; static int agent_exit = 0;
/*
* Local plugin errno
*/
static int plugin_errno = SLURM_SUCCESS;
static struct jobcomp_errno {
int n;
const char *descr;
} errno_table [] = {
{ 0, "No Error" },
{ EACCES, "Script access denied" },
{ EEXIST, "Script does not exist" },
{ EINVAL, "JocCompLoc invalid" },
{ -1, "Unknown Error" }
};
/*
* Return string representation of plugin errno
*/
static const char * jobcomp_script_strerror (int errnum)
{
struct jobcomp_errno *ep = errno_table;
while ((ep->n != errnum) && (ep->n != -1))
ep++;
return (ep->descr);
}
/*
* Structure for holding job completion information for later
* use by script;
*/
struct jobcomp_info {
uint32_t jobid;
uint32_t uid;
uint32_t limit;
uint32_t nprocs;
uint16_t batch_flag;
time_t submit;
time_t start;
time_t end;
char *nodes;
char *name;
char *partition;
char *jobstate;
char *account;
};
struct jobcomp_info * jobcomp_info_create (struct job_record *job)
{
enum job_states state;
struct jobcomp_info * j = xmalloc (sizeof (*j));
j->jobid = job->job_id;
j->uid = job->user_id;
j->name = xstrdup (job->name);
/*
* Job will typically be COMPLETING when this code is called.
* We remove the COMPLETING flag to hopefully get the evenual
* completion state: e.g.: JOB_FAILED, TIMEOUT, ....
*/
state = job->job_state & (~JOB_COMPLETING);
j->jobstate = xstrdup (job_state_string (state));
j->partition = xstrdup (job->partition);
j->limit = job->time_limit;
j->start = job->start_time;
j->end = job->end_time;
j->submit = job->details ? job->details->submit_time:job->start_time;;
j->batch_flag = job->batch_flag;
j->nodes = xstrdup (job->nodes);
j->nprocs = job->num_procs;
j->account = job->account ? xstrdup (job->account) : NULL;
return (j);
}
void jobcomp_info_destroy (struct jobcomp_info *j)
{
if (j == NULL)
return;
xfree (j->name);
xfree (j->partition);
xfree (j->nodes);
xfree (j->jobstate);
xfree (j->account);
xfree (j);
}
/* /*
* Check if the script exists and if we can execute it. * Check if the script exists and if we can execute it.
*/ */
static int static int
_check_script_permissions(char * location) _check_script_permissions(char * path)
{ {
struct stat filestat; struct stat st;
uid_t user;
gid_t group;
if(stat(location,&filestat) < 0) { if (stat(path, &st) < 0) {
plugin_errno = errno; plugin_errno = errno;
return error("jobcomp/script script does not exist"); return error("jobcomp/script: %s does not exist", path);
} }
if(!(filestat.st_mode&S_IFREG)) { if (!(st.st_mode & S_IFREG)) {
plugin_errno = EACCES; plugin_errno = EACCES;
return error("jobcomp/script script isn't a regular file"); return error("jobcomp/script: %s isn't a regular file", path);
} }
user = geteuid(); if (access(path, X_OK) < 0) {
group = getegid(); plugin_errno = EACCES;
if(user == filestat.st_uid) { return error("jobcomp/script: %s is not executable", path);
if (filestat.st_mode&S_IXUSR) {
return SLURM_SUCCESS;
} else {
plugin_errno = EACCES;
return error("jobcomp/script script is not executable");
}
} else if(group == filestat.st_gid) {
if (filestat.st_mode&S_IXGRP) {
return SLURM_SUCCESS;
} else {
plugin_errno = EACCES;
return error("jobcomp/script script is not executable");
}
} else if(filestat.st_mode&S_IXOTH) {
return SLURM_SUCCESS;
} }
plugin_errno = EACCES; return SLURM_SUCCESS;
return error("jobcomp/script script is not executable");
} }
/* Write "name=value" to dest, returns pointer to after that key-pair string, static char ** _extend_env (char ***envp)
* where the next key-pair (if any) should be written. */
static char *_add_env(char *dest, char *name, char *value)
{ {
int len; char **ep;
size_t newcnt = (xsize (*envp) / sizeof (char *)) + 1;
*envp = xrealloc (*envp, newcnt * sizeof (char *));
len = sprintf(dest, "%s=%s", name, value); (*envp)[newcnt - 1] = NULL;
return dest + len + 1; ep = &((*envp)[newcnt - 2]);
/*
* Find last non-NULL entry
*/
while (*ep == NULL)
--ep;
return (++ep);
} }
/* Create a new environment pointer containing information from static int _env_append (char ***envp, const char *name, const char *val)
* slurm_jobcomp_log_record so that the script can access it.
*/
static char ** _create_environment(char *job, char *user, char *job_name,
char *job_state, char *partition, char *limit,
char* start, char * end, char * submit,
char * batch, char *node_list, char *procs, char *account)
{ {
int len = 0; char buf[4096];
char ** envptr; char *entry;
char **ep;
/* Increase ENV_COUNT if new environment variables are added */
#define ENV_COUNT 15 if (val == NULL)
val = "";
/* Add strlen() of any sizable fields added to the array */
len = strlen(job_name) + strlen(partition) + strlen(node_list) + snprintf (buf, sizeof (buf) - 1, "%s=%s", name, val);
strlen(account) + strlen(_PATH_STDPATH) + 1024;
if (!(entry = strdup (buf)))
if (!(envptr = (char **)try_xmalloc(len))) return (-1);
return NULL;
envptr[0] = (char *)envptr + (ENV_COUNT * sizeof(char *)); ep = _extend_env (envp);
*ep = entry;
/* NEXT_DEST = ADD_ENV(DEST, NAME=VALUE) */
envptr[1] = _add_env(envptr[0], "JOBID", job); return (0);
envptr[2] = _add_env(envptr[1], "UID", user); }
envptr[3] = _add_env(envptr[2], "JOBNAME", job_name);
envptr[4] = _add_env(envptr[3], "JOBSTATE", job_state);
envptr[5] = _add_env(envptr[4], "PARTITION", partition); static int _env_append_fmt (char ***envp, const char *name,
envptr[6] = _add_env(envptr[5], "LIMIT", limit); const char *fmt, ...)
envptr[7] = _add_env(envptr[6], "START", start); {
envptr[8] = _add_env(envptr[7], "END", end); char val[1024];
envptr[9] = _add_env(envptr[8], "NODES", node_list); va_list ap;
envptr[10] = _add_env(envptr[9], "SUBMIT", submit);
envptr[11] = _add_env(envptr[10], "BATCH", batch); va_start (ap, fmt);
envptr[12] = _add_env(envptr[11], "PROCS", procs); vsnprintf (val, sizeof (val) - 1, fmt, ap);
envptr[13] = _add_env(envptr[12], "ACCOUNT", account); va_end (ap);
return (_env_append (envp, name, val));
}
static char ** _create_environment (struct jobcomp_info *job)
{
char **env;
env = xmalloc (1 * sizeof (*env));
env[0] = NULL;
_env_append_fmt (&env, "JOBID", "%u", job->jobid);
_env_append_fmt (&env, "UID", "%u", job->uid);
_env_append_fmt (&env, "START", "%lu", job->start);
_env_append_fmt (&env, "END", "%lu", job->end);
_env_append_fmt (&env, "SUBMIT","%lu", job->submit);
_env_append_fmt (&env, "PROCS", "%u", job->nprocs);
_env_append (&env, "BATCH", (job->batch_flag ? "yes" : "no"));
_env_append (&env, "NODES", job->nodes);
_env_append (&env, "ACCOUNT", job->account);
_env_append (&env, "JOBNAME", job->name);
_env_append (&env, "JOBSTATE", job->jobstate);
_env_append (&env, "PARTITION", job->partition);
if (job->limit == INFINITE)
_env_append (&env, "LIMIT", "UNLIMITED");
else
_env_append_fmt (&env, "LIMIT", "%u", job->limit);
#ifdef _PATH_STDPATH #ifdef _PATH_STDPATH
_add_env(envptr[13], "PATH", _PATH_STDPATH); _env_append (&env, "PATH", _PATH_STDPATH);
#else #else
envptr[13] = NULL; _env_append (&env, "PATH", "/bin:/usr/bin");
#endif #endif
/* envptr[14] through envptr[ENV_COUNT] are NULL from xmalloc. return (env);
* Since last entry must be NULL, add new records between }
* "ACCOUNT" and "PATH" above. */
static int redirect_stdio (void)
return envptr; {
int devnull;
if ((devnull = open ("/dev/null", O_RDWR)) < 0)
return error ("jobcomp/script: Failed to open /dev/null: %m");
if (dup2 (devnull, STDIN_FILENO) < 0)
return error ("jobcomp/script: Failed to redirect stdin: %m");
if (dup2 (devnull, STDOUT_FILENO) < 0)
return error ("jobcomp/script: Failed to redirect stdout: %m");
if (dup2 (devnull, STDERR_FILENO) < 0)
return error ("jobcomp/script: Failed to redirect stderr: %m");
close (devnull);
return (0);
}
static void jobcomp_child (char * script, char **env)
{
char * args[] = {script, NULL};
const char *tmpdir;
#ifdef _PATH_TMP
tmpdir = _PATH_TMP;
#else
tmpdir = "/tmp";
#endif
/*
* Reinitialize log so we can log any errors for
* diagnosis
*/
log_reinit ();
if (redirect_stdio () < 0)
exit (1);
if (chdir (tmpdir) != 0) {
error ("jobcomp/script: chdir (%s): %m", _PATH_TMP);
exit(1);
}
execve(script, args, env);
/*
* Failure of execve implies error
*/
error ("jobcomp/script: execve(%s): %m\n", script);
exit (1);
} }
static int jobcomp_exec_child (char *script, char **env)
{
pid_t pid;
int status = 0;
if (script == NULL || env == NULL)
return (-1);
if ((pid = fork()) < 0) {
error ("jobcomp/script: fork: %m");
return (-1);
}
if (pid == 0)
jobcomp_child (script, env);
/*
* Parent continues
*/
if (waitpid(pid, &status, 0) < 0)
error ("jobcomp/script: waitpid: %m");
if (WEXITSTATUS(status))
error ("jobcomp/script: script %s exited with status %d\n",
script, WEXITSTATUS(status));
return (0);
}
/* /*
* Thread function that executes a script * Thread function that executes a script
*/ */
void *script_agent (void *args) { void * script_agent (void *args)
pid_t pid = -1; {
char user_id_str[32],job_id_str[32], nodes_cache[1]; info ("jobcomp_script_strerror (137) = %s\n",
char start_str[32], end_str[32], lim_str[32]; jobcomp_script_strerror (137));
char submit_str[32], procs_str[32], account_str[128];
char * argvp[] = {script, NULL}; while (1) {
int status; struct jobcomp_info *job;
char ** envp, *nodes, *batch_str;
job_record job;
while(1) {
pthread_mutex_lock(&comp_list_mutex); pthread_mutex_lock(&comp_list_mutex);
while ((list_is_empty(comp_list) != 0) && (agent_exit == 0)) {
pthread_cond_wait(&comp_list_cond, &comp_list_mutex);
}
if (agent_exit) {
pthread_mutex_unlock(&comp_list_mutex);
return NULL;
}
job = (job_record)list_pop(comp_list);
pthread_mutex_unlock(&comp_list_mutex);
snprintf(user_id_str,sizeof(user_id_str),"%u",job->user_id); if (list_is_empty(comp_list) && !agent_exit)
snprintf(job_id_str,sizeof(job_id_str),"%u",job->job_id); pthread_cond_wait(&comp_list_cond, &comp_list_mutex);
snprintf(start_str, sizeof(start_str),"%lu",
(unsigned long)job->start);
snprintf(end_str, sizeof(end_str),"%lu",
(unsigned long)job->end);
snprintf(submit_str, sizeof(submit_str),"%lu",
(unsigned long)job->submit);
if (job->batch_flag)
batch_str = "yes";
else
batch_str = "no";
nodes_cache[0] = '\0';
if (job->limit == INFINITE) {
strcpy(lim_str, "UNLIMITED");
} else {
snprintf(lim_str, sizeof(lim_str), "%lu",
(unsigned long) job->limit);
}
if (job->node_list == NULL) {
nodes = nodes_cache;
} else {
nodes = job->node_list;
}
snprintf(procs_str, sizeof(procs_str), "%lu", /*
(unsigned long) job->num_procs); * It is safe to unlock list mutex here. List has its
if (job->account) { * own internal mutex that protects the comp_list itself
snprintf(account_str, sizeof(account_str), "%s", */
job->account); pthread_mutex_unlock(&comp_list_mutex);
} else
account_str[0] = '\0';
/* Setup environment*/
envp = _create_environment(job_id_str,user_id_str,
job->job_name, job->job_state,
job->partition, lim_str,
start_str, end_str, submit_str,
batch_str, nodes, procs_str,
account_str);
if(envp == NULL) {
plugin_errno = ENOMEM;
}
pid = fork(); if ((job = list_pop(comp_list))) {
char **envp;
if (pid < 0) { if ((envp = _create_environment (job)) == NULL) {
xfree(envp); error ("jobcomp/script: create env failed!");
job_record_destroy((void *)job); jobcomp_info_destroy (job);
plugin_errno = errno; continue;
} else if (pid == 0) {
/*Change directory to tmp*/
if (chdir(_PATH_TMP) != 0) {
exit(errno);
} }
/*Redirect stdin, stderr, and stdout to /dev/null*/ if (jobcomp_exec_child (script, envp) < 0)
if (freopen(_PATH_DEVNULL, "rb", stdin) == NULL) { error ("jobcomp/script: %s failed");
exit(errno);
}
if (freopen(_PATH_DEVNULL, "wb", stdout) == NULL) {
exit(errno);
}
if (freopen(_PATH_DEVNULL, "wb", stderr) == NULL) {
exit(errno);
}
/*Exec Script*/
execve(script,argvp,envp);
} else {
xfree(envp); xfree(envp);
job_record_destroy((void *)job); jobcomp_info_destroy (job);
waitpid(pid,&status,0);
if(WIFEXITED(status) && WEXITSTATUS(status)) {
plugin_errno = WEXITSTATUS(status);
}
} }
/*
* Exit if flag is set and we have no more entries to log
*/
if (agent_exit && list_is_empty (comp_list))
break;
} }
return NULL;
} }
/* /*
* init() is called when the plugin is loaded, before any other functions * init() is called when the plugin is loaded, before any other functions
* are called. Put global initialization here. * are called. Put global initialization here.
*/ */
int init ( void ) int init (void)
{ {
pthread_attr_t attr; pthread_attr_t attr;
...@@ -334,18 +453,17 @@ int init ( void ) ...@@ -334,18 +453,17 @@ int init ( void )
pthread_mutex_lock(&thread_flag_mutex); pthread_mutex_lock(&thread_flag_mutex);
comp_list = list_create((ListDelF) job_record_destroy); if (!(comp_list = list_create((ListDelF) jobcomp_info_destroy)))
if(comp_list == NULL) {
return SLURM_ERROR; return SLURM_ERROR;
}
if (script_thread) { if (script_thread) {
debug2( "Script thread already running, not starting another"); debug2( "Script thread already running, not starting another");
pthread_mutex_unlock(&thread_flag_mutex); pthread_mutex_unlock(&thread_flag_mutex);
return SLURM_ERROR; return SLURM_ERROR;
} }
slurm_attr_init(&attr); slurm_attr_init(&attr);
pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_DETACHED ); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
pthread_create(&script_thread, &attr, script_agent, NULL); pthread_create(&script_thread, &attr, script_agent, NULL);
pthread_mutex_unlock(&thread_flag_mutex); pthread_mutex_unlock(&thread_flag_mutex);
...@@ -355,16 +473,15 @@ int init ( void ) ...@@ -355,16 +473,15 @@ int init ( void )
} }
/* Set the location of the script to run*/ /* Set the location of the script to run*/
int slurm_jobcomp_set_location ( char * location ) int slurm_jobcomp_set_location (char * location)
{ {
if (location == NULL) { if (location == NULL) {
plugin_errno = EACCES; plugin_errno = EACCES;
return error("jobcomp/script JobCompLoc needs to be set"); return error("jobcomp/script JobCompLoc needs to be set");
} }
if (_check_script_permissions(location) != SLURM_SUCCESS) { if (_check_script_permissions(location) != SLURM_SUCCESS)
return SLURM_ERROR; return SLURM_ERROR;
}
xfree(script); xfree(script);
script = xstrdup(location); script = xstrdup(location);
...@@ -372,47 +489,34 @@ int slurm_jobcomp_set_location ( char * location ) ...@@ -372,47 +489,34 @@ int slurm_jobcomp_set_location ( char * location )
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
int slurm_jobcomp_log_record ( struct job_record *job_ptr ) int slurm_jobcomp_log_record (struct job_record *record)
{ {
job_record job; struct jobcomp_info * job;
enum job_states job_state;
time_t submit;
debug3("Entering slurm_jobcomp_log_record"); debug3("Entering slurm_jobcomp_log_record");
/* Job will typically be COMPLETING when this is called.
* We remove this flag to get the eventual completion state: if ((job = jobcomp_info_create (record)))
* JOB_FAILED, JOB_TIMEOUT, etc. */ return error ("jobcomp/script: Failed to create job info!");
job_state = job_ptr->job_state & (~JOB_COMPLETING);
if (job_ptr->details)
submit = job_ptr->details->submit_time;
else
submit = job_ptr->start_time;
job = job_record_create(job_ptr->job_id, job_ptr->user_id, job_ptr->name,
job_state_string(job_state),
job_ptr->partition, job_ptr->time_limit,
job_ptr->start_time, job_ptr->end_time,
submit, job_ptr->batch_flag, job_ptr->nodes,
job_ptr->num_procs, job_ptr->account);
pthread_mutex_lock(&comp_list_mutex); pthread_mutex_lock(&comp_list_mutex);
list_append(comp_list,(void *)job); list_append(comp_list, job);
pthread_mutex_unlock(&comp_list_mutex); pthread_mutex_unlock(&comp_list_mutex);
pthread_cond_broadcast(&comp_list_cond); pthread_cond_broadcast(&comp_list_cond);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
/* Return the error code of the plugin*/ /* Return the error code of the plugin*/
int slurm_jobcomp_get_errno( void ) int slurm_jobcomp_get_errno(void)
{ {
return plugin_errno; return plugin_errno;
} }
/* Return a string representation of the error */ /* Return a string representation of the error */
char *slurm_jobcomp_strerror( int errnum ) const char * slurm_jobcomp_strerror(int errnum)
{ {
strerror_r(errnum,error_str,sizeof(error_str)); return jobcomp_script_strerror (errnum);
return error_str;
} }
static int _wait_for_thread (pthread_t thread_id) static int _wait_for_thread (pthread_t thread_id)
...@@ -424,7 +528,13 @@ static int _wait_for_thread (pthread_t thread_id) ...@@ -424,7 +528,13 @@ static int _wait_for_thread (pthread_t thread_id)
return SLURM_SUCCESS; return SLURM_SUCCESS;
usleep(1000); usleep(1000);
} }
error("Could not kill jobcomp script pthread");
/*
* jobcomp thread appears to be stuck, try harder:
*/
if (pthread_kill(thread_id, SIGKILL))
error("Could not kill jobcomp script pthread");
return SLURM_ERROR; return SLURM_ERROR;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment