From 0a6f3068d59d4ef64da237acc75e2ecc38507d00 Mon Sep 17 00:00:00 2001
From: "Christopher J. Morrone" <morrone2@llnl.gov>
Date: Fri, 17 Nov 2006 22:20:50 +0000
Subject: [PATCH] svn merge -r10123:10186
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

Includes Mark Grondona's addition of the slurm_spank_local_user_init()
function, with my massaging to make it more compatible with the trunk code.
However, there is no support in the trunk API task launch code (and therefore
slaunch) for slurm_spank_local_user_init().  I will add that in a future
commit.
---
 NEWS                                  |  10 ++
 doc/man/man8/spank.8                  |  30 ++++-
 slurm/spank.h                         |  19 ++-
 src/common/plugstack.c                | 168 ++++++++++++++++++++------
 src/common/plugstack.h                |  12 ++
 src/plugins/sched/backfill/backfill.c |  12 +-
 src/slurmctld/job_mgr.c               |   8 +-
 src/slurmctld/ping_nodes.c            |   2 +-
 src/slurmctld/proc_req.c              |   2 +-
 src/slurmd/slurmstepd/req.c           |  19 ++-
 src/srun/allocate.c                   |   1 +
 src/srun/opt.c                        |   5 +-
 src/srun/srun.c                       |  20 +++
 13 files changed, 245 insertions(+), 63 deletions(-)

diff --git a/NEWS b/NEWS
index e672d7c305d..c990411820d 100644
--- a/NEWS
+++ b/NEWS
@@ -107,6 +107,12 @@ documents those changes that are of interest to users and admins.
     the code)
  -- Added support for OSX build.
 
+* Changes in SLURM 1.1.20
+=========================
+ -- Added new SPANK plugin hook slurm_spank_local_user_init() called
+    from srun after node allocation.
+ -- Fixed bug with hostfile support not working on a direct srun.
+
 * Changes in SLURM 1.1.19
 =========================
  - BLUEGENE - make sure the order of blocks read in from the bluegene.conf
@@ -491,6 +497,10 @@ documents those changes that are of interest to users and admins.
  -- switch/elan: Fix bug in propagation of ELAN_STATKEY environment variable.
  -- Fix bug in slurmstepd IO code that can result in it spinning if a
     certain error occurs.
+ -- Remove nodes from srun's required node list if their count exceeds 
+    the number of requested tasks.
+ -- sched/backfill to schedule around jobs that are hung in a completing 
+    state.
 
 * Changes in SLURM 1.0.15
 =========================
diff --git a/doc/man/man8/spank.8 b/doc/man/man8/spank.8
index 4f35417b62c..c8f5b5cbd2c 100644
--- a/doc/man/man8/spank.8
+++ b/doc/man/man8/spank.8
@@ -19,11 +19,11 @@ behavior of SLURM job launch.
 .SH "SPANK PLUGINS"
 \fBSPANK\fR plugins are loaded in two separate contexts during a 
 \fBSLURM\fR job.  In "local" context, the plugin is loaded by \fBsrun\fR
-or other \fBSLURM\fR user interface. In local context, the plugin options
-are read by \fBSPANK\fR, and options are presented to the user. In 
-"remote" context, the plugin is running on a compute node of the job,
-in other words, the plugin is loaded by \fBslurmd\fR. Only the 
-\fBinit\fR and \fBexit\fR functions are called in local context.
+or other \fBSLURM\fR user interface. In local context, options provided by 
+plugins are read by \fBSPANK\fR, and these options are presented to the user. 
+In "remote" context, the plugin is loaded on a compute node of the job,
+in other words, the plugin is loaded by \fBslurmd\fR. In local context, only
+the \fBinit\fR, \fBexit\fR, and \fBlocal_user_init\fR functions are called.
 Plugins may query the context in which they are running with the
 \fBspank_remote\fR function defined in \fB<slurm/spank.h>\fR.
 .LP
@@ -35,6 +35,9 @@ Called just after plugins are loaded. In remote context, this is
 just after job step is initialized. For local context, this is before
 user options are processed.
 .TP
+\fBslurm_spank_local_user_init\fR
+Called in local (srun) context only, after all options have been processed.
+.TP
 \fBslurm_spank_user_init\fR 
 Called after privileges are temporarily dropped. (remote context only)
 .TP
@@ -68,6 +71,19 @@ SLURM when the plugin calls functions like \fBspank_get_item\fR and
 below) are passed in the argument vector \fBargv\fR with argument
 count \fBac\fR.
 .LP
+\fBSPANK\fR plugins can query the current list of supported slurm_spank_*
+symbols to determine if the current version supports a given plugin hook.
+This may be useful because the list of plugin symbols may grow in the
+future. The query is done using the \fBspank_symbol_supported\fR function,
+which has the following prototype:
+.nf
+
+    int \fBspank_symbol_supported\fR (const char *sym);
+
+.fi
+.LP
+The return value is 1 if the symbol is supported, 0 if not, and \-1 for an invalid argument.
+.LP
 \fBSPANK\fR plugins do not have direct access to internally defined SLURM 
 data structures. Instead, information about the currently executing
 job is obtained via the \fBspank_get_item\fR function call.
@@ -118,6 +134,10 @@ the job's environment. The prototypes are:
  spank_err_t \fBspank_unsetenv\fR (spank_t spank, const char *var);
 .fi
 .LP
+These are only necessary in remote context, since in local context the
+standard process environment may be read and modified directly with
+\fBgetenv\fR(3), \fBsetenv\fR(3), and \fBunsetenv\fR(3).
+.LP
 See \fBspank.h\fR for more information, and \fBEXAMPLES\fR below for an example
 for \fBspank_getenv\fR usage.
 .SH "SPANK OPTIONS"
diff --git a/slurm/spank.h b/slurm/spank.h
index d31fb6bb3bb..e32c6e7f4d9 100644
--- a/slurm/spank.h
+++ b/slurm/spank.h
@@ -77,9 +77,12 @@ typedef int (spank_f) (spank_t spank, int ac, char *argv[]);
  *               |          `-> task_exit ()
  *               `-> fini ()
  *
+ *   In srun only the init() and local_user_init() callbacks are used.
+ *
  */
 
 extern spank_f slurm_spank_init;
+extern spank_f slurm_spank_local_user_init;
 extern spank_f slurm_spank_user_init;
 extern spank_f slurm_spank_task_init;
 extern spank_f slurm_spank_task_post_fork;
@@ -188,6 +191,17 @@ extern struct spank_option spank_options [];
  */
 BEGIN_C_DECLS
 
+/*
+ *  Determine whether a given spank plugin symbol is supported
+ *   in this version of SPANK interface.
+ *
+ *  Returns:
+ *  = 1   The symbol is supported
+ *  = 0   The symbol is not supported
+ *  = -1  Invalid argument
+ */
+int spank_symbol_supported (const char *symbol);
+
 /*
  *  Determine whether plugin is loaded "local" or "remote."
  * 
@@ -207,8 +221,9 @@ int spank_remote (spank_t spank);
  *   
  *  Returns ESPANK_SUCCESS on success, ESPANK_NOTASK if an S_TASK*
  *   item is requested from outside a task context, ESPANK_BAD_ARG
- *   if invalid args are passed to spank_get_item, and 
- *   ESPANK_NOT_REMOTE if not called from slurmd context.
+ *   if invalid args are passed to spank_get_item or spank_get_item
+ *   is called from an invalid context, and ESPANK_NOT_REMOTE 
+ *   if not called from slurmd context or slurm_spank_local_user_init.
  */
 spank_err_t spank_get_item (spank_t spank, spank_item_t item, ...);
 
diff --git a/src/common/plugstack.c b/src/common/plugstack.c
index 8f5c87f6b99..5899b76cfa9 100644
--- a/src/common/plugstack.c
+++ b/src/common/plugstack.c
@@ -53,6 +53,7 @@
 #include "src/common/job_options.h"
 
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
+#include "src/srun/srun_job.h"
 
 #include <slurm/spank.h>
 
@@ -61,6 +62,7 @@
 
 struct spank_plugin_operations {
 	spank_f *init;
+	spank_f *local_user_init;
 	spank_f *user_init;
 	spank_f *user_task_init;
 	spank_f *task_post_fork;
@@ -68,9 +70,10 @@ struct spank_plugin_operations {
 	spank_f *exit;
 };
 
-const int n_spank_syms = 6;
+const int n_spank_syms = 7;
 const char *spank_syms[] = {
 	"slurm_spank_init",
+	"slurm_spank_local_user_init",
 	"slurm_spank_user_init",
 	"slurm_spank_task_init",
 	"slurm_spank_task_post_fork",
@@ -127,6 +130,7 @@ typedef enum spank_handle_type {
  */
 typedef enum step_fn {
 	SPANK_INIT = 0,
+	LOCAL_USER_INIT,
 	STEP_USER_INIT,
 	STEP_USER_TASK_INIT,
 	STEP_TASK_POST_FORK,
@@ -139,11 +143,10 @@ struct spank_handle {
 	int                  magic;  /* Magic identifier to ensure validity. */
 	spank_handle_type_t  type;   /* remote(slurmd) || local(srun)        */
 	step_fn_t            phase;  /* Which spank fn are we called from?   */
-	slurmd_job_t *       job;    /* Reference to current slurmd job      */
+	void               * job;    /* Reference to current srun|slurmd job */
 	slurmd_task_info_t * task;   /* Reference to current task (if valid) */
 };
 
-
 /*
  *  SPANK plugins stack
  */
@@ -412,7 +415,7 @@ static int _spank_stack_create(const char *path, List * listp)
 }
 
 static int
-_spank_handle_init(struct spank_handle *spank, slurmd_job_t * job,
+_spank_handle_init(struct spank_handle *spank, void * arg,
 		   int taskid, step_fn_t fn)
 {
 	memset(spank, 0, sizeof(*spank));
@@ -420,11 +423,15 @@ _spank_handle_init(struct spank_handle *spank, slurmd_job_t * job,
 
 	spank->phase = fn;
 
-	if (job != NULL) {
-		spank->type = S_TYPE_REMOTE;
-		spank->job = job;
-		if (taskid >= 0)
-			spank->task = job->task[taskid];
+	if (arg != NULL) {
+		spank->job = arg;
+		if (fn == LOCAL_USER_INIT)
+			spank->type = S_TYPE_LOCAL;
+		else {
+			spank->type = S_TYPE_REMOTE;
+			if (taskid >= 0)
+				spank->task = ((slurmd_job_t *) arg)->task[taskid];
+		}
 	} else {
 		spank->type = S_TYPE_LOCAL;
 	}
@@ -436,6 +443,8 @@ static const char *_step_fn_name(step_fn_t type)
 	switch (type) {
 	case SPANK_INIT:
 		return ("init");
+	case LOCAL_USER_INIT:
+		return ("local_user_init");
 	case STEP_USER_INIT:
 		return ("user_init");
 	case STEP_USER_TASK_INIT:
@@ -452,7 +461,7 @@ static const char *_step_fn_name(step_fn_t type)
 	return ("unknown");
 }
 
-static int _do_call_stack(step_fn_t type, slurmd_job_t * job, int taskid)
+static int _do_call_stack(step_fn_t type, void * job, int taskid)
 {
 	int rc = 0;
 	ListIterator i;
@@ -483,6 +492,14 @@ static int _do_call_stack(step_fn_t type, slurmd_job_t * job, int taskid)
 				       fn_name, rc);
 			}
 			break;
+		case LOCAL_USER_INIT:
+			if (sp->ops.local_user_init) {
+				rc = (*sp->ops.local_user_init) (spank, sp->ac, 
+			 				         sp->argv);
+				debug2("spank: %s: %s = %d\n", name,
+						fn_name, rc);
+			}
+			break;
 		case STEP_USER_INIT:
 			if (sp->ops.user_init) {
 				rc = (*sp->ops.user_init) (spank, sp->ac,
@@ -566,11 +583,17 @@ int spank_init(slurmd_job_t * job)
 	return (0);
 }
 
+
 int spank_user(slurmd_job_t * job)
 {
 	return (_do_call_stack(STEP_USER_INIT, job, -1));
 }
 
+int spank_local_user(struct spank_launcher_job_info *job)
+{
+	return (_do_call_stack(LOCAL_USER_INIT, job, -1));
+}
+
 int spank_user_task(slurmd_job_t * job, int taskid)
 {
 	return (_do_call_stack(STEP_USER_TASK_INIT, job, taskid));
@@ -980,10 +1003,49 @@ global_to_local_id (slurmd_job_t *job, uint32_t gid, uint32_t *p2uint32)
 }
 	
 
+/*
+ *  Return 1 if spank_item_t is valid for S_TYPE_LOCAL
+ */
+static int valid_in_local_context (spank_item_t item)
+{
+	int rc = 0;
+	switch (item) {
+	case S_JOB_UID: 
+	case S_JOB_GID:
+	case S_JOB_ID:
+	case S_JOB_STEPID:
+	case S_JOB_ARGV:
+	case S_JOB_ENV:
+	case S_JOB_TOTAL_TASK_COUNT:
+	case S_JOB_NNODES:
+		rc = 1;
+		break;
+	default:
+		rc = 0;
+	}
+	return (rc);
+}
+
+
 /*
  *  Global functions for SPANK plugins
  */
 
+int spank_symbol_supported (const char *name)
+{
+	int i;
+
+	if ((name == NULL))
+		return (-1);
+
+	for (i = 0; i < n_spank_syms; i++) {
+		if (strcmp (spank_syms [i], name) == 0)
+			return (1);
+	}
+
+	return (0);
+}
+
 int spank_remote(spank_t spank)
 {
 	if ((spank == NULL) || (spank->magic != SPANK_MAGIC))
@@ -1007,70 +1069,102 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
 	pid_t  pid;
 	char ***p2argv;
 	slurmd_task_info_t *task;
-	va_list vargs; spank_err_t rc = ESPANK_SUCCESS;
+	slurmd_job_t  *slurmd_job = NULL;
+	struct spank_launcher_job_info *launcher_job = NULL;
+	va_list vargs; 
+	spank_err_t rc = ESPANK_SUCCESS;
 
 	if ((spank == NULL) || (spank->magic != SPANK_MAGIC))
 		return (ESPANK_BAD_ARG);
 
-	if (spank->type != S_TYPE_REMOTE)
+	if ( (spank->type != S_TYPE_REMOTE) 
+	  && (!valid_in_local_context(item)))
 		return (ESPANK_NOT_REMOTE);
 
 	if (spank->job == NULL)
 		return (ESPANK_BAD_ARG);
 
+	if (spank->type == S_TYPE_LOCAL)
+		launcher_job = spank->job;
+	else
+		slurmd_job = spank->job;
+
 	va_start(vargs, item);
 	switch (item) {
 	case S_JOB_UID:
 		p2uid = va_arg(vargs, uid_t *);
-		*p2uid = spank->job->uid;
+		if (spank->type == S_TYPE_LOCAL)
+			*p2uid = launcher_job->uid;
+		else
+			*p2uid = slurmd_job->uid;
 		break;
 	case S_JOB_GID:
 		p2gid = va_arg(vargs, gid_t *);
-		*p2gid = spank->job->gid;
+		if (spank->type == S_TYPE_LOCAL)
+			*p2gid = launcher_job->gid;
+		else
+			*p2gid = slurmd_job->gid;
 		break;
 	case S_JOB_SUPPLEMENTARY_GIDS:
 		p2gids = va_arg(vargs, gid_t **);
 		p2int = va_arg(vargs, int *);
-		*p2gids = spank->job->gids;
-		*p2int = spank->job->ngids;
+		*p2gids = slurmd_job->gids;
+		*p2int = slurmd_job->ngids;
 		break;
 	case S_JOB_ID:
 		p2uint32 = va_arg(vargs, uint32_t *);
-		*p2uint32 = spank->job->jobid;
+		if (spank->type == S_TYPE_LOCAL)
+			*p2uint32 = launcher_job->jobid;
+		else
+			*p2uint32 = slurmd_job->jobid;
 		break;
 	case S_JOB_STEPID:
 		p2uint32 = va_arg(vargs, uint32_t *);
-		*p2uint32 = spank->job->stepid;
+		if (spank->type == S_TYPE_LOCAL)
+			*p2uint32 = launcher_job->stepid;
+		else
+			*p2uint32 = slurmd_job->stepid;
 		break;
 	case S_JOB_NNODES:
 		p2uint32 = va_arg(vargs, uint32_t *);
-		*p2uint32 = spank->job->nnodes;
+		if (spank->type == S_TYPE_LOCAL)
+			*p2uint32 = launcher_job->step_layout->node_cnt;
+		else
+			*p2uint32 = slurmd_job->nnodes;
 		break;
 	case S_JOB_NODEID:
 		p2uint32 = va_arg(vargs, uint32_t *);
-		*p2uint32 = spank->job->nodeid;
+		*p2uint32 = slurmd_job->nodeid;
 		break;
 	case S_JOB_LOCAL_TASK_COUNT:
 		p2uint32 = va_arg(vargs, uint32_t *);
-		*p2uint32 = spank->job->ntasks;
+		*p2uint32 = slurmd_job->ntasks;
 		break;
 	case S_JOB_TOTAL_TASK_COUNT:
 		p2uint32 = va_arg(vargs, uint32_t *);
-		*p2uint32 = spank->job->nprocs;
+		if (spank->type == S_TYPE_LOCAL)
+			*p2uint32 = launcher_job->step_layout->task_cnt;
+		else
+			*p2uint32 = slurmd_job->nprocs;
 		break;
 	case S_JOB_NCPUS:
 		p2uint16 = va_arg(vargs, uint16_t *);
-		*p2uint16 = spank->job->cpus;
+		*p2uint16 = slurmd_job->cpus;
 		break;
 	case S_JOB_ARGV:
 		p2int = va_arg(vargs, int *);
-		*p2int = spank->job->argc;
 		p2argv = va_arg(vargs, char ***);
-		*p2argv = spank->job->argv;
+		if (spank->type == S_TYPE_LOCAL) {
+			*p2int = launcher_job->argc;
+			*p2argv = launcher_job->argv;
+		} else {
+			*p2int = slurmd_job->argc;
+			*p2argv = slurmd_job->argv;
+		}
 		break;
 	case S_JOB_ENV:
 		p2argv = va_arg(vargs, char ***);
-		*p2argv = spank->job->env;
+		*p2argv = slurmd_job->env;
 		break;
 	case S_TASK_ID:
 		p2int = va_arg(vargs, int *);
@@ -1113,7 +1207,7 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
 
 		if (!tasks_execd(spank))
 			rc = ESPANK_NOT_EXECD;
-		else if (!(task = job_task_info_by_pid (spank->job, pid)))
+		else if (!(task = job_task_info_by_pid (slurmd_job, pid)))
 			rc = ESPANK_NOEXIST;
 		else 
 			*p2uint32 = task->gtid;
@@ -1125,7 +1219,7 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
 
 		if (!tasks_execd(spank))
 			rc = ESPANK_NOT_EXECD;
-		else if (!(task = job_task_info_by_pid (spank->job, pid)))
+		else if (!(task = job_task_info_by_pid (slurmd_job, pid)))
 			rc = ESPANK_NOEXIST;
 		else 
 			*p2uint32 = task->id;
@@ -1135,15 +1229,15 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
 		p2uint32 = va_arg(vargs, uint32_t *);
 		*p2uint32 = (uint32_t) -1;
 
-		if (uint32 <= spank->job->ntasks) 
-			*p2uint32 = spank->job->task[uint32]->gtid;
+		if (uint32 <= slurmd_job->ntasks) 
+			*p2uint32 = slurmd_job->task[uint32]->gtid;
 		else 
 			rc = ESPANK_NOEXIST;
 		break;
 	case S_JOB_GLOBAL_TO_LOCAL_ID:
 		uint32 = va_arg(vargs, uint32_t);
 		p2uint32 = va_arg(vargs, uint32_t *);
-		rc = global_to_local_id (spank->job, uint32, p2uint32);
+		rc = global_to_local_id (slurmd_job, uint32, p2uint32);
 		break;
 	default:
 		rc = ESPANK_BAD_ARG;
@@ -1170,7 +1264,7 @@ spank_err_t spank_getenv(spank_t spank, const char *var, char *buf,
 	if (len < 0)
 		return (ESPANK_BAD_ARG);
 
-	if (!(val = getenvp(spank->job->env, var)))
+	if (!(val = getenvp(((slurmd_job_t *) spank->job)->env, var)))
 		return (ESPANK_ENV_NOEXIST);
 
 	if (strlcpy(buf, val, len) >= len)
@@ -1182,6 +1276,8 @@ spank_err_t spank_getenv(spank_t spank, const char *var, char *buf,
 spank_err_t spank_setenv(spank_t spank, const char *var, const char *val,
 			 int overwrite)
 {
+	slurmd_job_t * job;
+
 	if ((spank == NULL) || (spank->magic != SPANK_MAGIC))
 		return (ESPANK_BAD_ARG);
 
@@ -1194,10 +1290,12 @@ spank_err_t spank_setenv(spank_t spank, const char *var, const char *val,
 	if ((var == NULL) || (val == NULL))
 		return (ESPANK_BAD_ARG);
 
-	if (getenvp(spank->job->env, var) && !overwrite)
+	job = spank->job;
+
+	if (getenvp(job->env, var) && !overwrite)
 		return (ESPANK_ENV_EXISTS);
 
-	if (setenvf(&spank->job->env, var, "%s", val) < 0)
+	if (setenvf(&job->env, var, "%s", val) < 0)
 		return (ESPANK_ERROR);
 
 	return (ESPANK_SUCCESS);
@@ -1217,7 +1315,7 @@ spank_err_t spank_unsetenv (spank_t spank, const char *var)
 	if (var == NULL)
 		return (ESPANK_BAD_ARG);
 
-	unsetenvp(spank->job->env, var);
+	unsetenvp(((slurmd_job_t *) spank->job)->env, var);
 	
 	return (ESPANK_SUCCESS);
 }
diff --git a/src/common/plugstack.h b/src/common/plugstack.h
index 9c5bc9441e3..3a10e683449 100644
--- a/src/common/plugstack.h
+++ b/src/common/plugstack.h
@@ -51,10 +51,22 @@
 #include "src/common/job_options.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
 
+struct spank_launcher_job_info {
+	uid_t       uid;
+	gid_t       gid;
+	uint32_t    jobid;
+	uint32_t    stepid;
+	slurm_step_layout_t *step_layout;
+	int         argc;
+	char      **argv;
+};
+
 int spank_init (slurmd_job_t *job);
 
 int spank_user (slurmd_job_t *job);
 
+int spank_local_user (struct spank_launcher_job_info *job);
+
 int spank_user_task (slurmd_job_t *job, int taskid);
 
 int spank_task_post_fork (slurmd_job_t *job, int taskid);
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 404564b1d28..b8440228a48 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -264,12 +264,13 @@ _has_state_changed(void)
 static void 
 _attempt_backfill(struct part_record *part_ptr)
 {
-	int i, error_code = 0;
+	int i, cg_hung = 0, error_code = 0;
 	uint32_t max_pending_prio = 0;
 	uint32_t min_pend_job_size = INFINITE;
 	struct job_record *job_ptr;
 	ListIterator job_iterator;
 	part_specs_t part_specs;
+	time_t now = time(NULL);
 
 #if __DEBUG
 	info("backfill: attempt on partition %s", part_ptr->name);
@@ -289,6 +290,13 @@ _attempt_backfill(struct part_record *part_ptr)
 			continue;	/* job in different partition */
 
 		if (job_ptr->job_state & JOB_COMPLETING) {
+			long wait_time = (long) difftime(now, job_ptr->end_time);
+			if (wait_time > 600) {
+				/* Job has been in completing state for 
+				 * >10 minutes, try to schedule around it */
+				cg_hung++;
+				continue;
+			}
 #if __DEBUG
 			info("backfill: Job %u completing, skip partition", 
 					job_ptr->job_id);
@@ -315,7 +323,7 @@ _attempt_backfill(struct part_record *part_ptr)
 	if (error_code) 
 		goto cleanup;
 
-	i = list_count(run_job_list);
+	i = list_count(run_job_list) + cg_hung;
 	if ( (i == 0) || (i > MAX_JOB_CNT) )
 		goto cleanup;		/* no running jobs or already have many */
 	if (list_is_empty(pend_job_list))
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 468cd0b88fe..e4148d02fe1 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1020,7 +1020,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 			if ((job_ptr->details == NULL) ||
 			    (job_ptr->kill_on_node_fail) ||
 			    (job_ptr->node_cnt <= 1)) {
-				error("Killing job_id %u on failed node %s",
+				info("Killing job_id %u on failed node %s",
 				      job_ptr->job_id, node_name);
 				job_ptr->job_state = JOB_NODE_FAIL | 
 						     JOB_COMPLETING;
@@ -2999,9 +2999,11 @@ void reset_job_bitmaps(void)
 
 		_reset_step_bitmaps(job_ptr);
 
-		if ((job_ptr->kill_on_step_done) &&
-		    (list_count(job_ptr->step_list) <= 1))
+		if ((job_ptr->kill_on_step_done)
+		&&  (list_count(job_ptr->step_list) <= 1)) {
+			info("Single job step done, job is complete");
 			job_fail = true;
+		}
 
 		if (job_fail) {
 			if (job_ptr->job_state == JOB_PENDING) {
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index af18b3ee43a..4c4a1b0a3a8 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -230,7 +230,7 @@ void ping_nodes (void)
 
 		/* Do not keep pinging down nodes since this can induce
 		 * huge delays in hierarchical communication fail-over */
-		if (no_resp_flag)
+		if ((no_resp_flag) && (base_state == NODE_STATE_DOWN))
 			continue;
 
 		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index e8dbea7fa95..c74d68ee29d 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1702,7 +1702,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 				     slurm_strerror(error_code));
 				slurm_send_rc_msg(msg, error_code);
 			} else {
-				info("_slurm_rpc_submit_batch_job JobId=%u %s",
+				info("_launch_batch_step JobId=%u %s",
 					job_desc_msg->job_id, TIME_STR);
 				submit_msg.job_id     = job_desc_msg->job_id;
 				submit_msg.step_id    = step_id;
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index a3abc5d7eef..edda3429f6d 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -560,9 +560,8 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid)
 
 	if (killpg(job->pgid, signal) == -1) {
 		rc = -1;
-		verbose("Error sending signal %d to %u.%u, pgid %d: %s", 
-			signal, job->jobid, job->stepid, job->pgid,
-			slurm_strerror(rc));
+		verbose("Error sending signal %d to %u.%u, pgid %d: %m", 
+			signal, job->jobid, job->stepid, job->pgid);
 	} else {
 		verbose("Sent signal %d to %u.%u, pgid %d", 
 			signal, job->jobid, job->stepid, job->pgid);
@@ -634,9 +633,9 @@ _handle_signal_task_local(int fd, slurmd_job_t *job, uid_t uid)
 
 	if (kill(job->task[ltaskid]->pid, signal) == -1) {
 		rc = -1;
-		verbose("Error sending signal %d to %u.%u, pid %d: %s", 
+		verbose("Error sending signal %d to %u.%u, pid %d: %m", 
 			signal, job->jobid, job->stepid,
-			job->task[ltaskid]->pid, slurm_strerror(rc));
+			job->task[ltaskid]->pid);
 	} else {
 		verbose("Sent signal %d to %u.%u, pid %d", 
 			signal, job->jobid, job->stepid,
@@ -699,9 +698,8 @@ _handle_signal_container(int fd, slurmd_job_t *job, uid_t uid)
 	if (slurm_container_signal(job->cont_id, signal) < 0) {
 		rc = -1;
 		errnum = errno;
-		verbose("Error sending signal %d to %u.%u: %s", 
-			signal, job->jobid, job->stepid, 
-			slurm_strerror(rc));
+		verbose("Error sending signal %d to %u.%u: %m", 
+			signal, job->jobid, job->stepid);
 	} else {
 		verbose("Sent signal %d to %u.%u", 
 			signal, job->jobid, job->stepid);
@@ -759,9 +757,8 @@ _handle_terminate(int fd, slurmd_job_t *job, uid_t uid)
 	if (slurm_container_signal(job->cont_id, SIGKILL) < 0) {
 		rc = -1;
 		errnum = errno;
-		verbose("Error sending signal %d to %u.%u: %s", 
-			SIGKILL, job->jobid, job->stepid, 
-			slurm_strerror(rc));
+		verbose("Error sending signal %d to %u.%u: %m", 
+			SIGKILL, job->jobid, job->stepid);
 	} else {
 		verbose("Sent signal %d to %u.%u", 
 			signal, job->jobid, job->stepid);
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index c45352bca9b..96d84852dcf 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -413,6 +413,7 @@ job_desc_msg_create_from_opts (char *script)
 			} else {
 				debug("loading nodes from hostfile %s",
 				      hostfile);
+				opt.nodelist = xstrdup(nodelist);
 				j->req_nodes = xstrdup(nodelist);
 				free(nodelist);
 				opt.distribution = SLURM_DIST_ARBITRARY;
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 0df946e51e0..65279b9e96d 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -2376,8 +2376,7 @@ _create_path_list(void)
 
 	if (!path) {
 		error("Error in PATH environment variable");
-		list_destroy(l);
-		return NULL;
+		return l;
 	}
 
 	c = lc = path;
@@ -2679,7 +2678,7 @@ static void _help(void)
 "  -l, --label                 prepend task number to lines of stdout/err\n"
 "  -u, --unbuffered            do not line-buffer stdout/err\n"
 "  -m, --distribution=type     distribution method for processes to nodes\n"
-"                              (type = block|cyclic|hostfile)\n"
+"                              (type = block|cyclic|arbitrary)\n"
 "  -J, --job-name=jobname      name of job\n"
 "      --jobid=id              run under already allocated job\n"
 "      --mpi=type              type of MPI being used\n"
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 1f1d8a2641a..679e9f79c7c 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -124,6 +124,7 @@ static void  _run_srun_epilog (srun_job_t *job);
 static int   _run_srun_script (srun_job_t *job, char *script);
 static int   _change_rlimit_rss(void);
 static int   _slurm_debug_env_val (void);
+static int   _call_spank_local_user (srun_job_t *job);
 
 int srun(int ac, char **av)
 {
@@ -324,6 +325,9 @@ int srun(int ac, char **av)
 
 	/* job structure should now be filled in */
 
+	if (_call_spank_local_user (job) < 0)
+		job_fatal(job, "Failure in local plugin stack");
+
 	/*
 	 *  Enhance environment for job
 	 */
@@ -459,6 +463,22 @@ int srun(int ac, char **av)
 	exit(exitcode);
 }
 
+static int _call_spank_local_user (srun_job_t *job)
+{
+	struct spank_launcher_job_info info[1];
+
+	info->uid = opt.uid;
+	info->gid = opt.gid;
+	info->jobid = job->jobid;
+	info->stepid = job->stepid;
+	info->step_layout = job->step_layout;	
+	info->argc = remote_argc;
+	info->argv = remote_argv;
+
+	return spank_local_user(info);
+}
+
+
 static int _slurm_debug_env_val (void)
 {
 	long int level = 0;
-- 
GitLab