diff --git a/NEWS b/NEWS
index a87ff6cc0adbada7823379e5d47a69ad1db46ae0..7a47fc4f595a27e6e37496534f777a740b50fefb 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,7 @@ documents those changes that are of interest to users and admins.
  -- squeue - Remove extra whitespace of default printout.
  -- BGQ - added head ppcfloor as an include dir when building.
  -- BGQ - Better debug messages in runjob_mux plugin.
+ -- PMI2 - Updated the Makefile.am to build a versioned library.
 
 * Changes in Slurm 2.6.0
 ========================
diff --git a/slurm/doc/html/hdf5_profile_user_guide.shtml b/slurm/doc/html/hdf5_profile_user_guide.shtml
deleted file mode 100644
index 73c6502f3a0e1bb564ae1ff4563f3bce80c2b7b6..0000000000000000000000000000000000000000
--- a/slurm/doc/html/hdf5_profile_user_guide.shtml
+++ /dev/null
@@ -1,336 +0,0 @@
-<!--#include virtual="header.txt"-->
-<!-- Copyright (C) 2013 Bull S. A. S.
-     Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois. -->
-
-<h1>Profiling Using HDF5 User Guide</h1>
-
-<h2>Contents</h2>
-<a href="#Overview">Overview</a><br>
-<a href="#Administration">Administration</a><br>
-<a href="#Profiling">Profiling Jobs</a><br>
-<a href="#HDF5">HDF5</a><br>
-<a href="#DataSeries">Data Series</a><br>
-
-
-<a id="Overview"></a>
-<h2>Overview</h2>
-The AcctGatherProfileType/hdf5 plugin allows SLURM to collect data on the
-jobs it runs on a cluster that is more detailed than is practical to
-include in its database. The data comes from periodically sampling various
-performance data collected by SLURM, the operating system, or
-component software. The plugin records the data from each source
-as a <b>Time Series</b> and also accumulates totals for each statistic for
-the job.
-
-<p>Time Series are energy data collected by an AcctGatherEnergy plugin,
-I/O data from a network interface collected by an AcctGatherInfiniband plugin,
-I/O data from parallel file systems such as Lustre,
-and task performance data such as local disk I/O, cpu consumption,
-and memory use, as well as potential data from other sources.
-
-<p>The data is collected into a file on a shared file system for each step on
-each allocated node of a job and then merged into an HDF5 file.
-Individual files on a shared file system were chosen because the data can be
-voluminous, so solutions that pass data to the SLURM control
-daemon via RPC may not scale to very large clusters or jobs with
-many allocated nodes.
-
-<p>A separate document, the <a href="acct_gather_profile_plugins.html">
-SLURM Profile Accounting Plugin API (AcctGatherProfileType)</a>, describes how
-to write other Profile Accounting plugins.
-
-<a id="Administration"></a>
-<h2>Administration</h2>
-
-<h3>Shared File System</h3>
-<div style="margin-left: 20px;">
-The HDF5 Profile Plugin requires a common shared file system on all the compute
-nodes. While a job is running, the plugin writes a file into this file
-system for each step of the job on each node. When the job ends,
-the merge process is launched and the node-step files are combined into one
-HDF5 file for the job.
-<p>
-The root of the directory structure is declared in the <b>ProfileHDF5Dir</b>
-option in the acct_gather.conf file. The directory will be created by SLURM
-if it doesn't exist.
-<p>
-Each user that creates a profile will have a subdirectory of the profile
-directory that has read/write permission only for the user.
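-<p>
-As an illustrative sketch (the path is hypothetical and the exact file
-naming is version dependent), with <b>ProfileHDF5Dir=/app/slurm/profile_data</b>
-the plugin would produce a layout along these lines:
-<pre>
-/app/slurm/profile_data/        # ProfileHDF5Dir, shared by all nodes
-    alice/                      # per-user subdirectory, accessible only by alice
-        1234_0_node01.h5        # node-step file: job 1234, step 0, node01
-        1234_0_node02.h5
-        1234.h5                 # merged job file produced after the job ends
-</pre>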
-</div>
-<h3>Configuration parameters</h3>
-
-<div style="margin-left: 20px;">
-The profile plugin is enabled in the
-<a href="slurm.conf.html">slurm.conf</a> file, but is internally
-configured in the
-<a href="acct_gather.conf.html">acct_gather.conf</a> file.
-</div>
-<div style="margin-left: 20px;">
-<h4>slurm.conf parameters</h4>
-<div style="margin-left: 20px;">
-This line in slurm.conf enables the HDF5 Profile Plugin.
-<br><b>AcctGatherProfileType=acct_gather_profile/hdf5</b>
-</div>
-</div>
-<div style="margin-left: 20px;">
-<h4>acct_gather.conf parameters</h4>
-<div style="margin-left: 20px;">
-These parameters are directly used by the HDF5 Profile Plugin.
-<dl>
-<dt><B>ProfileHDF5Dir</B>=&lt;path&gt;</dt>
-<dd>This parameter is the path to the shared folder into which the
-acct_gather_profile plugin will write detailed data as an HDF5 file.
-The directory is assumed to be on a file system shared by the controller and
-all compute nodes. This is a required parameter.</dd>
-<dt><B>ProfileHDF5CollectDefault</B>=opt{,opt{,opt}}</dt>
-<dd>Default <b>--profile</b> value for the data types collected for each job
-submission. It is a comma-separated list of data streams.
-Use this option with caution. A node-step file will be created on every
-node of every step of every job. They will not automatically be merged
-into job files. (Even job files for small jobs would fill the
-file system.) This option is intended for test environments where you
-might want to profile a series of jobs but do not want to have to
-add the --profile option to the launch scripts.</dd>
-</dl>
-</div>
-</div>
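-<p>
-Putting the two files together, a minimal sketch of a configuration might
-look like this (the path and frequency are illustrative, not recommendations):
-<pre>
-# slurm.conf
-AcctGatherProfileType=acct_gather_profile/hdf5
-JobAcctGatherType=jobacct_gather/linux
-JobAcctGatherFrequency=30
-
-# acct_gather.conf
-ProfileHDF5Dir=/app/slurm/profile_data
-# Collect task data for every job by default; use with caution (see above).
-ProfileHDF5CollectDefault=Task
-</pre>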
-
-<div style="margin-left: 20px;">
-<h4>Time Series Control Parameters</h4>
-<div style="margin-left: 20px;">
-Other plugins add time series data to the HDF5 collection. They typically
-have a polling frequency specified in one of the above configuration files.
-<p>
-The following table summarizes the parameters that control sample frequency.
-<p>
-<table border="1" style="margin-left: 20px; padding: 5px;">
-<tr><th>Conf file</th><th>Parameter</th><th>Time Series</th></tr>
-<tr><td>slurm.conf</td><td>JobAcctGatherFrequency</td><td>Task, Lustre</td></tr>
-<tr><td>acct_gather.conf</td><td>EnergyIPMIFrequency</td><td>Energy</td></tr>
-<tr><td>acct_gather.conf</td><td>InfinibandOFEDFrequency</td>
-<td>Network</td></tr>
-</table>
-</div>
-</div>
-<a id="Profiling"></a>
-<h2>Profiling Jobs</h2>
-<h3>Data Collection</h3>
-The --profile option on salloc|sbatch|srun controls whether data is
-collected and what type of data is collected. If --profile is not specified,
-no data is collected (unless the <B>ProfileHDF5CollectDefault</B>
-option is used in acct_gather.conf; --profile on the command line overrides
-any value specified in the configuration file).
-<p>
-<DT><B>--profile</B>=&lt;all|none|[energy[,|task[,|lustre[,|network]]]]&gt;
-<DD>
-Enables detailed data collection by the acct_gather_profile plugin.
-Detailed data are typically time series that are stored in an HDF5 file for
-the job.</DD>
-</DT>
-<P>
-<div style="margin-left: 20px;">
-<DL>
-<DT><B>All</B>
-<DD>All data types are collected. (Cannot be combined with other values.)
-</DD></DT>
-<P>
-<DT><B>None</B>
-<DD>No data types are collected. This is the default.
-<BR> (Cannot be combined with other values.)
-</DD></DT>
-
-<DT><B>Energy</B>
-<DD>Energy data is collected.</DD></DT>
-
-<DT><B>Task</B>
-<DD>Task (I/O, Memory, ...) data is collected.</DD></DT>
-
-<DT><B>Lustre</B>
-<DD>Lustre data is collected.</DD></DT>
-
-<DT><B>Network</B>
-<DD>Network (InfiniBand) data is collected.</DD></DT>
-
-</DL>
-</div>
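-<p>
-For example, to collect the task and energy time series for a batch job
-(the script name is hypothetical):
-<pre>
-sbatch -n 32 --profile=task,energy my_job.sh
-</pre>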
-
-<h3>Data Consolidation</h3>
-The node-step files are merged into one HDF5 file for the job using the
-<a href="sh5util.html">sh5util</a> program.
-
-<p>The command line may be added to the normal launch script, if the job is
-started with sbatch. For example:
-<pre>
-sbatch -n1 -d$last_job_id --wrap="sh5util --profile=none -j $last_job_id"
-</pre>
-Note that --profile=none is required if the enclosing sbatch command included
-a --profile parameter.
-
-<h3>Data Extraction</h3>
-The <a href="sh5util.html">sh5util</a> program can also be used to extract
-specific data from the HDF5 file and write it in <i>comma separated value</i>
-form for import into other analysis tools such as spreadsheets.
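-<p>
-A minimal sketch of an extraction follows; the flags shown are assumptions
-and may differ by version, so consult the
-<a href="sh5util.html">sh5util</a> man page:
-<pre>
-# Extract the Energy series of job 1234 into a CSV file (flags assumed)
-sh5util -j 1234 -E --series=Energy -o job_1234_energy.csv
-</pre>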
-
-<a id="HDF5"></a>
-<h2>HDF5</h2>
-HDF5 is a well-known structured file format that allows heterogeneous but
-related data to be stored in one file
-(i.e. sections for energy statistics, sections for network I/O,
-sections for Task data, ...).
-Its internal structure resembles a
-file system, with <b>groups</b> being similar to <i>directories</i> and
-<b>data sets</b> being similar to <i>files</i>. It also allows <b>attributes</b>
-to be attached to groups to store application-defined properties.
-
-<p>There are commodity programs, notably
-<a href="http://www.hdfgroup.org/hdf-java-html/hdfview/index.html">
-HDFView</a>, for viewing and manipulating these files.
-
-<p>Below is a screen shot from HDFView expanding the job tree and showing the
-attributes for a specific task.
-<p>
-<img src="hdf5_task_attr.png" width="275" height="275">
-
-<a id="DataSeries"></a>
-<h2>Data Structure</h2>
-
-<table>
-<tr>
-<td><img src="hdf5_job_outline.png" width="205" height="570"></td>
-<td style="vertical-align: top;">
-<div style="margin-left: 5px;">
-In the job file, there will be a group for each <b>step</b> of the job.
-Within each step, there will be a group for nodes, and a group for tasks.
-</div>
-<ul>
-<li>
-The <b>nodes</b> group will have a group for each node in the step allocation.
-For each node group, there is a sub-group for Time Series and another
-for Totals.
-<ul>
-<li>
-The <b>Time Series</b> group
-contains a group/dataset containing the time series for each collector.
-</li>
-<li>
-The <b>Totals</b> group contains a corresponding group/dataset that has the
-Minimum, Average, Maximum, and Sum Total for each item in the time series.
-</li>
-</ul>
-</li>
-<li>
-The <b>Tasks</b> group will only contain a subgroup for each task.
-It primarily contains an attribute stating the node on which the task was
-executed. This set of groups is essentially a cross-reference table.
-</li>
-</ul>
-</td></tr>
-</table>
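-<p>
-Because the job file is ordinary HDF5, it can also be read programmatically.
-The following is a minimal sketch in C using the HDF5 library; it is not part
-of SLURM, and the file and group names are assumptions (verify the actual
-layout with h5dump or HDFView):
-<pre>
-#include &lt;hdf5.h&gt;
-#include &lt;stdio.h&gt;
-
-int main(void)
-{
-    hid_t file, step;
-
-    /* Merged job file produced by sh5util (name assumed) */
-    file = H5Fopen("1234.h5", H5F_ACC_RDONLY, H5P_DEFAULT);
-    if (file &lt; 0) {
-        fprintf(stderr, "cannot open job file\n");
-        return 1;
-    }
-    /* Open the group for the first step; the group name is illustrative */
-    step = H5Gopen2(file, "/Step 0", H5P_DEFAULT);
-    if (step &gt;= 0)
-        H5Gclose(step);
-    H5Fclose(file);
-    return 0;
-}
-</pre>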
-
-<h3>Energy Data</h3>
-<b>AcctGatherEnergyType=acct_gather_energy/ipmi</b>
-is required in slurm.conf to collect energy data.
-Also appropriately set
-<b>EnergyIPMIFrequency</b>
-in acct_gather.conf.
-<p>
-Each data sample in the Energy Time Series contains the following data items.
-<DL>
-<DT><B>Date Time</B>
-<DD>Time of day at which the data sample was taken. This can be used to
-correlate activity with other sources such as logs.</DD></DT>
-<DT><B>Time</B>
-<DD>Elapsed time since the beginning of the step.</DD></DT>
-<DT><B>Power</B>
-<DD>Power consumption during the interval.</DD></DT>
-<DT><B>CPU Frequency</B>
-<DD>CPU frequency at time of sample in kilohertz.</DD></DT>
-</DL>
-
-<h3>Infiniband Data</h3>
-<b>AcctGatherInfinibandType=acct_gather_infiniband/ofed</b>
-is required in slurm.conf to collect network data.
-Also appropriately set
-<b>InfinibandOFEDFrequency</b>
-in acct_gather.conf.
-<p>
-Each data sample in the Network Time Series contains the following data items.
-<DL>
-<DT><B>Date Time</B>
-<DD>Time of day at which the data sample was taken. This can be used to
-correlate activity with other sources such as logs.</DD></DT>
-<DT><B>Time</B>
-<DD>Elapsed time since the beginning of the step.</DD></DT>
-<DT><B>Packets In</B>
-<DD>Number of packets coming in.</DD></DT>
-<DT><B>Megabytes Read</B>
-<DD>Number of megabytes coming in through the interface.</DD></DT>
-<DT><B>Packets Out</B>
-<DD>Number of packets going out.</DD></DT>
-<DT><B>Megabytes Write</B>
-<DD>Number of megabytes going out through the interface.</DD></DT>
-</DL>
-
-<h3>Lustre Data</h3>
-<b>JobAcctGatherType=jobacct_gather/linux</b>
-is required in slurm.conf to collect Lustre data.
-Also appropriately set
-<b>JobAcctGatherFrequency</b>
-in slurm.conf.
-<p>
-Each data sample in the Lustre Time Series contains the following data items.
-<DL>
-<DT><B>Date Time</B>
-<DD>Time of day at which the data sample was taken. This can be used to
-correlate activity with other sources such as logs.</DD></DT>
-<DT><B>Time</B>
-<DD>Elapsed time since the beginning of the step.</DD></DT>
-<DT><B>Reads</B>
-<DD>Number of read operations.</DD></DT>
-<DT><B>Megabytes Read</B>
-<DD>Number of megabytes read.</DD></DT>
-<DT><B>Writes</B>
-<DD>Number of write operations.</DD></DT>
-<DT><B>Megabytes Write</B>
-<DD>Number of megabytes written.</DD></DT>
-</DL>
-
-<h3>Task Data</h3>
-<b>JobAcctGatherType=jobacct_gather/linux</b>
-is required in slurm.conf to collect task data.
-Also appropriately set
-<b>JobAcctGatherFrequency</b>
-in slurm.conf.
-<p>
-Each data sample in the Task Time Series contains the following data items.
-<DL>
-<DT><B>Date Time</B>
-<DD>Time of day at which the data sample was taken. This can be used to
-correlate activity with other sources such as logs.</DD></DT>
-<DT><B>Time</B>
-<DD>Elapsed time since the beginning of the step.</DD></DT>
-<DT><B>CPU Frequency</B>
-<DD>CPU frequency at time of sample.</DD></DT>
-<DT><B>CPU Time</B>
-<DD>Seconds of CPU time used during the sample.</DD></DT>
-<DT><B>CPU Utilization</B>
-<DD>CPU utilization during the interval.</DD></DT>
-<DT><B>RSS</B>
-<DD>Value of RSS at time of sample.</DD></DT>
-<DT><B>VM Size</B>
-<DD>Value of VM size at time of sample.</DD></DT>
-<DT><B>Pages</B>
-<DD>Pages used in sample.</DD></DT>
-<DT><B>Read Megabytes</B>
-<DD>Number of megabytes read from local disk.</DD></DT>
-<DT><B>Write Megabytes</B>
-<DD>Number of megabytes written to local disk.</DD></DT>
-</DL>
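-<p>
-For example, the Energy and Network series above could be enabled with the
-following sketch (the frequencies are illustrative):
-<pre>
-# slurm.conf
-AcctGatherEnergyType=acct_gather_energy/ipmi
-AcctGatherInfinibandType=acct_gather_infiniband/ofed
-
-# acct_gather.conf
-EnergyIPMIFrequency=30
-InfinibandOFEDFrequency=30
-</pre>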
-
-<p class="footer"><a href="#top">top</a></p>
-
-<p style="text-align:center;">Last modified 17 May 2013</p>
-
-<!--#include virtual="footer.txt"-->
diff --git a/src/plugins/proctrack/cray/Makefile.am b/src/plugins/proctrack/cray/Makefile.am
index 620a260cf87b8585744112168b3b529bad031b87..8d3ba4f4b25ac643077c18f3c8a1858bbbe6910f 100644
--- a/src/plugins/proctrack/cray/Makefile.am
+++ b/src/plugins/proctrack/cray/Makefile.am
@@ -4,11 +4,12 @@ AUTOMAKE_OPTIONS = foreign
 PLUGIN_FLAGS = -module -avoid-version --export-dynamic
 
-AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common
+AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common $(CRAY_CPPFLAGS)
 
-pkglib_LTLIBRARIES = proctrack_cray.la
+if HAVE_REAL_CRAY
+PROC_PLUG = proctrack_cray.la
+endif
+pkglib_LTLIBRARIES = $(PROC_PLUG)
 
 proctrack_cray_la_SOURCES = proctrack_cray.c
-proctrack_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(OTHER_FLAGS)
-# Don't need to add -ljob because we dlopen the .so to avoid
-# symbol collisions with slurm functions
+proctrack_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_LDFLAGS)
diff --git a/src/plugins/proctrack/cray/Makefile.in b/src/plugins/proctrack/cray/Makefile.in
index 2b9a5a531314a0a0233134d72e353a3501991a7f..9cd8948777ccd9cf467516587bd4cef38d6ceabc 100644
--- a/src/plugins/proctrack/cray/Makefile.in
+++ b/src/plugins/proctrack/cray/Makefile.in
@@ -139,6 +139,7 @@ proctrack_cray_la_OBJECTS = $(am_proctrack_cray_la_OBJECTS)
 proctrack_cray_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
 	$(proctrack_cray_la_LDFLAGS) $(LDFLAGS) -o $@
+@HAVE_REAL_CRAY_TRUE@am_proctrack_cray_la_rpath = -rpath $(pkglibdir)
 DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) -I$(top_builddir)/slurm
 depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp
 am__depfiles_maybe = depfiles
@@ -395,10 +396,11 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 AUTOMAKE_OPTIONS = foreign
 PLUGIN_FLAGS = -module -avoid-version --export-dynamic
-AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common
-pkglib_LTLIBRARIES = proctrack_cray.la
+AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common $(CRAY_CPPFLAGS)
+@HAVE_REAL_CRAY_TRUE@PROC_PLUG = proctrack_cray.la
+pkglib_LTLIBRARIES = $(PROC_PLUG)
 proctrack_cray_la_SOURCES = proctrack_cray.c
-proctrack_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(OTHER_FLAGS)
+proctrack_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_LDFLAGS)
 all: all-am
 
 .SUFFIXES:
@@ -466,7 +468,7 @@ clean-pkglibLTLIBRARIES:
 	  rm -f "$${dir}/so_locations"; \
 	done
 proctrack_cray.la: $(proctrack_cray_la_OBJECTS) $(proctrack_cray_la_DEPENDENCIES) $(EXTRA_proctrack_cray_la_DEPENDENCIES)
-	$(proctrack_cray_la_LINK) -rpath $(pkglibdir) $(proctrack_cray_la_OBJECTS) $(proctrack_cray_la_LIBADD) $(LIBS)
+	$(proctrack_cray_la_LINK) $(am_proctrack_cray_la_rpath) $(proctrack_cray_la_OBJECTS) $(proctrack_cray_la_LIBADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@@ -709,8 +711,6 @@ uninstall-am: uninstall-pkglibLTLIBRARIES
 	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
 	tags uninstall uninstall-am uninstall-pkglibLTLIBRARIES
 
-# Don't need to add -ljob because we dlopen the .so to avoid
-# symbol collisions with slurm functions
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/src/plugins/proctrack/cray/proctrack_cray.c b/src/plugins/proctrack/cray/proctrack_cray.c
index dae07097e6d1b437ee98315620ab58a8ca40daa9..ef8c89b0b0c233553553ab6f5659fb3caae5cfba 100644
--- a/src/plugins/proctrack/cray/proctrack_cray.c
+++ b/src/plugins/proctrack/cray/proctrack_cray.c
@@ -53,6 +53,8 @@
 #include <unistd.h>
 #include <dlfcn.h>
 
+#include <job.h>	/* Cray's job module component */
+
 #include "slurm/slurm.h"
 #include "slurm/slurm_errno.h"
 #include "src/common/log.h"
@@ -64,22 +66,6 @@ const char plugin_name[] = "Process tracking via Cray job module";
 const char plugin_type[] = "proctrack/cray";
 const uint32_t plugin_version = 91;
 
-/*
- * We can't include <job.h> since its prototypes conflict with some
- * of SLURM's. Instead, put important function protypes and
- * the jid_t typedef here:
- */
-typedef uint64_t jid_t;
-
-typedef jid_t (*create_f) (jid_t jid_requested, uid_t uid, int options);
-typedef jid_t (*getjid_f) (pid_t pid);
-typedef jid_t (*waitjid_f) (jid_t jid, int *status, int options);
-typedef int (*killjid_f) (jid_t jid, int sig);
-typedef jid_t (*detachpid_f) (pid_t pid);
-typedef jid_t (*attachpid_f) (pid_t pid, jid_t jid_requested);
-typedef int (*getpidlist_f)(jid_t jid, pid_t *pid, int bufsize);
-typedef int (*getpidcnt_f) (jid_t jid);
-
 /*
  * Handle to libjob.so
  */
@@ -88,67 +74,11 @@ static pthread_t threadid = 0;
 static pthread_cond_t notify = PTHREAD_COND_INITIALIZER;
 static pthread_mutex_t notify_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-/*
- * libjob operations we'll need in this plugin
- */
-static struct job_operations {
-	create_f create;
-	getjid_f getjid;
-	waitjid_f waitjid;
-	killjid_f killjid;
-	detachpid_f detachpid;
-	attachpid_f attachpid;
-	getpidlist_f getpidlist;
-	getpidcnt_f getpidcnt;
-} job_ops;
-
-static jid_t _job_create(jid_t jid, uid_t uid, int options)
-{
-	return ((*job_ops.create)(jid, uid, options));
-}
-
-static jid_t _job_getjid(pid_t pid)
-{
-	return ((*job_ops.getjid)(pid));
-}
-
-static jid_t _job_waitjid(jid_t jid, int *status, int options)
-{
-	return ((*job_ops.waitjid)(jid, status, options));
-}
-
-static int _job_killjid(jid_t jid, int sig)
-{
-	return ((*job_ops.killjid)(jid, sig));
-}
-
-/* not used */
-/* static int _job_detachpid(pid_t pid) */
-/* { */
-/*	return ((*job_ops.detachpid)(pid)); */
-/* } */
-
-static int _job_attachpid(pid_t pid, jid_t jid)
-{
-	return ((*job_ops.attachpid)(pid, jid));
-}
-
-static int _job_getpidlist(jid_t jid, pid_t *pid, int bufsize)
-{
-	return ((*job_ops.getpidlist)(jid, pid, bufsize));
-}
-
-static int _job_getpidcnt(jid_t jid)
-{
-	return ((*job_ops.getpidcnt)(jid));
-}
-
 static void *_create_container_thread(void *args)
 {
 	stepd_step_rec_t *job = (stepd_step_rec_t *)args;
 
-	if ((job->cont_id = (uint64_t)_job_create(0, job->uid, 0))
+	if ((job->cont_id = (uint64_t)job_create(0, job->uid, 0))
 	    == (jid_t)-1) {
 		error ("Failed to create job container: %m");
 		return NULL;
@@ -169,55 +99,37 @@ static void *_create_container_thread(void *args)
 	return NULL;
 }
 
+static void _end_container_thread(void)
+{
+	if (threadid) {
+		/* This will end the thread and remove it from the container */
+		slurm_mutex_lock(&notify_mutex);
+		pthread_cond_signal(&notify);
+		slurm_mutex_unlock(&notify_mutex);
+
+		pthread_join(threadid, NULL);
+		threadid = 0;
+	}
+}
+
 /*
  * init() is called when the plugin is loaded, before any other functions
  * are called.  Put global initialization here.
  */
 extern int init(void)
 {
-	/* We dlopen() libjob.so instead of directly linking to it
-	 * because of symbols like "job_create" in libjob which
-	 * conflict with symbols in slurmd. dlopening the library
-	 * prevents these symbols from going into the global namespace.
-	 */
-	if ((libjob_handle = dlopen("libjob.so", RTLD_LAZY)) == NULL) {
-		error ("Unable to open libjob.so: %m");
-		return SLURM_ERROR;
-	}
-
-	job_ops.create     = dlsym(libjob_handle, "job_create");
-	job_ops.getjid     = dlsym(libjob_handle, "job_getjid");
-	job_ops.waitjid    = dlsym(libjob_handle, "job_waitjid");
-	job_ops.killjid    = dlsym(libjob_handle, "job_killjid");
-	job_ops.detachpid  = dlsym(libjob_handle, "job_detachpid");
-	job_ops.attachpid  = dlsym(libjob_handle, "job_attachpid");
-	job_ops.getpidlist = dlsym(libjob_handle, "job_getpidlist");
-	job_ops.getpidcnt  = dlsym(libjob_handle, "job_getpidcnt");
-
-	if (!job_ops.create)
-		error("Unable to resolve job_create in libjob.so");
-	if (!job_ops.getjid)
-		error("Unable to resolve job_getjid in libjob.so");
-	if (!job_ops.waitjid)
-		error("Unable to resolve job_waitjid in libjob.so");
-	if (!job_ops.killjid)
-		error("Unable to resolve job_killjid in libjob.so");
-	if (!job_ops.detachpid)
-		error("Unable to resolve job_detachpid in libjob.so");
-	if (!job_ops.attachpid)
-		error("Unable to resolve job_attachpid in libjob.so");
-	if (!job_ops.getpidlist)
-		error("Unable to resolve job_getpidlist in libjob.so");
-	if (!job_ops.getpidcnt)
-		error("Unable to resolve job_getpidcnt in libjob.so");
-
-	debug ("successfully loaded libjob.so");
+	debug("%s loaded", plugin_name);
 
 	return SLURM_SUCCESS;
 }
 
 extern int fini(void)
 {
-	dlclose(libjob_handle);
+	_end_container_thread();
+
+	/* free up some memory */
+	slurm_mutex_destroy(&notify_mutex);
+	pthread_cond_destroy(&notify);
+
 	return SLURM_SUCCESS;
 }
 
@@ -242,14 +154,22 @@ extern int slurm_container_plugin_create(stepd_step_rec_t *job)
 	   container automatically.  Empty containers are not valid.
 	*/
 
+	if (threadid) {
+		debug("Had a thread already %d", threadid);
+		slurm_mutex_lock(&notify_mutex);
+		pthread_cond_wait(&notify, &notify_mutex);
+		slurm_mutex_unlock(&notify_mutex);
+		debug("Last thread done %d", threadid);
+	}
 	pthread_attr_init(&attr);
 	pthread_create(&threadid, &attr, _create_container_thread, job);
 
 	slurm_mutex_lock(&notify_mutex);
 	pthread_cond_wait(&notify, &notify_mutex);
 	slurm_mutex_unlock(&notify_mutex);
 
-	debug("slurm_container_plugin_create: created jid 0x%08lx",
-	      job->cont_id);
+	debug("slurm_container_plugin_create: created jid "
+	      "0x%08lx thread %d",
+	      job->cont_id, threadid);
 	} else
 		error("slurm_container_plugin_create: already have a cont_id");
@@ -263,35 +183,17 @@ extern int slurm_container_plugin_create(stepd_step_rec_t *job)
  * (once) at this time.
  */
 int slurm_container_plugin_add(stepd_step_rec_t *job, pid_t pid)
 {
-	static bool first = 1;
-
-	if (_job_attachpid(pid, job->cont_id) == (jid_t) -1) {
+	if (job_attachpid(pid, job->cont_id) == (jid_t) -1)
 		error("Failed to attach pid %d to job container: %m", pid);
-		return SLURM_ERROR;
-	}
-
-	if (!first)
-		return SLURM_SUCCESS;
-	first = 0;
-
-	/* This will end the thread and remove it from the container */
-	slurm_mutex_lock(&notify_mutex);
-	pthread_cond_signal(&notify);
-	slurm_mutex_unlock(&notify_mutex);
-
-	pthread_join(threadid, NULL);
-
-	/* free up some memory */
-	slurm_mutex_destroy(&notify_mutex);
-	pthread_cond_destroy(&notify);
+	_end_container_thread();
 
 	return SLURM_SUCCESS;
 }
 
 int slurm_container_plugin_signal(uint64_t id, int sig)
 {
-	if ((_job_killjid((jid_t) id, sig) < 0)
+	if ((job_killjid((jid_t) id, sig) < 0)
 	    && (errno != ENODATA) && (errno != EBADF) )
 		return (SLURM_ERROR);
 
 	return (SLURM_SUCCESS);
@@ -300,6 +202,9 @@ int slurm_container_plugin_signal(uint64_t id, int sig)
 int slurm_container_plugin_destroy(uint64_t id)
 {
 	int status;
+
+	debug("destroying 0x%08lx %d", id, threadid);
+
 	_job_waitjid((jid_t) id, &status, 0);
 	/* Assume any error means job doesn't exist. Therefore,
 	 * return SUCCESS to slurmd so it doesn't retry continuously
@@ -311,7 +216,7 @@ uint64_t slurm_container_plugin_find(pid_t pid)
 {
 	jid_t jid;
 
-	if ((jid = _job_getjid(pid)) == (jid_t) -1)
+	if ((jid = job_getjid(pid)) == (jid_t) -1)
 		return ((uint64_t) 0);
 
 	return ((uint64_t) jid);
@@ -321,7 +226,7 @@ bool slurm_container_plugin_has_pid (uint64_t cont_id, pid_t pid)
 {
 	jid_t jid;
 
-	if ((jid = _job_getjid(pid)) == (jid_t) -1)
+	if ((jid = job_getjid(pid)) == (jid_t) -1)
 		return false;
 	if ((uint64_t)jid != cont_id)
 		return false;
@@ -332,7 +237,7 @@ bool slurm_container_plugin_has_pid (uint64_t cont_id, pid_t pid)
 int slurm_container_plugin_wait(uint64_t id)
 {
 	int status;
 
-	if (_job_waitjid((jid_t) id, &status, 0) == (jid_t)-1)
+	if (job_waitjid((jid_t) id, &status, 0) == (jid_t)-1)
 		return SLURM_ERROR;
 
 	return SLURM_SUCCESS;
@@ -343,7 +248,7 @@ int slurm_container_plugin_get_pids(uint64_t cont_id, pid_t **pids, int *npids)
 	int pidcnt, bufsize;
 	pid_t *p;
 
-	pidcnt = _job_getpidcnt((jid_t)cont_id);
+	pidcnt = job_getpidcnt((jid_t)cont_id);
 	if (pidcnt > 0) {
 		/*
 		 * FIXME - The "+ 128" is a rough attempt to allow for
@@ -352,7 +257,7 @@ int slurm_container_plugin_get_pids(uint64_t cont_id, pid_t **pids, int *npids)
 		 */
 		bufsize = sizeof(pid_t) * (pidcnt + 128);
 		p = (pid_t *)xmalloc(bufsize);
-		pidcnt = _job_getpidlist((jid_t)cont_id, p, bufsize);
+		pidcnt = job_getpidlist((jid_t)cont_id, p, bufsize);
 		if (pidcnt == -1) {
 			error("job_getpidlist() failed: %m");
 			*pids = NULL;
diff --git a/src/plugins/select/bluegene/runjob_plugin.cc b/src/plugins/select/bluegene/runjob_plugin.cc
index 8981436c5eab482a891d75ab14eba359f9e21e33..78c7568fa7c97780dd177dfb390004d970060d3f 100644
--- a/src/plugins/select/bluegene/runjob_plugin.cc
+++ b/src/plugins/select/bluegene/runjob_plugin.cc
@@ -143,8 +143,8 @@ static void _send_failed_cnodes(uint32_t job_id, uint32_t step_id, uint16_t sig)
 		if ((count > max_tries)
 		    || rc == ESLURM_ALREADY_DONE || rc == ESLURM_INVALID_JOB_ID)
 			break;
-		std::cerr << "Trying to fail cnodes, message from slurmctld: "
-			  << slurm_strerror(rc) << std::endl;
+		LOG_WARN_MSG("Trying to fail cnodes, message from slurmctld: "
+			     << slurm_strerror(rc));
 
 		sleep (5);
 		count++;
 	}