From 0a6f3068d59d4ef64da237acc75e2ecc38507d00 Mon Sep 17 00:00:00 2001
From: "Christopher J. Morrone" <morrone2@llnl.gov>
Date: Fri, 17 Nov 2006 22:20:50 +0000
Subject: [PATCH] svn merge -r10123:10186
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

Includes Mark Grondona's addition of the slurm_spank_local_user_init()
function, with my massaging to make it more compatible with the trunk code.

However, there is no support in the trunk API task launch code (and
therefore slaunch) for slurm_spank_local_user_init().  I will add that
in a future commit.
---
 NEWS                                  |  10 ++
 doc/man/man8/spank.8                  |  30 ++++-
 slurm/spank.h                         |  19 ++-
 src/common/plugstack.c                | 168 ++++++++++++++++++++------
 src/common/plugstack.h                |  12 ++
 src/plugins/sched/backfill/backfill.c |  12 +-
 src/slurmctld/job_mgr.c               |   8 +-
 src/slurmctld/ping_nodes.c            |   2 +-
 src/slurmctld/proc_req.c              |   2 +-
 src/slurmd/slurmstepd/req.c           |  19 ++-
 src/srun/allocate.c                   |   1 +
 src/srun/opt.c                        |   5 +-
 src/srun/srun.c                       |  20 +++
 13 files changed, 245 insertions(+), 63 deletions(-)

diff --git a/NEWS b/NEWS
index e672d7c305d..c990411820d 100644
--- a/NEWS
+++ b/NEWS
@@ -107,6 +107,12 @@ documents those changes that are of interest to users and admins.
     the code)
  -- Added support for OSX build.
 
+* Changes in SLURM 1.1.20
+=========================
+ - Added new SPANK plugin hook slurm_spank_local_user_init() called
+   from srun after node allocation.
+ - Fixed bug with hostfile support not working on a direct srun
+
 * Changes in SLURM 1.1.19
 =========================
  - BLUEGENE - make sure the order of blocks read in from the bluegene.conf
@@ -491,6 +497,10 @@ documents those changes that are of interest to users and admins.
  -- switch/elan: Fix bug in propagation of ELAN_STATKEY environment
     variable.
  -- Fix bug in slurmstepd IO code that can result in it spinning if a
    certain error occurs.
+ -- Remove nodes from srun's required node list if their count exceeds
+    the number of requested tasks.
+ -- sched/backfill to schedule around jobs that are hung in a completing
+    state.
 
 * Changes in SLURM 1.0.15
 =========================
diff --git a/doc/man/man8/spank.8 b/doc/man/man8/spank.8
index 4f35417b62c..c8f5b5cbd2c 100644
--- a/doc/man/man8/spank.8
+++ b/doc/man/man8/spank.8
@@ -19,11 +19,11 @@ behavior of SLURM job launch.
 .SH "SPANK PLUGINS"
 \fBSPANK\fR plugins are loaded in two separate contexts during a
 \fBSLURM\fR job. In "local" context, the plugin is loaded by \fBsrun\fR
-or other \fBSLURM\fR user interface. In local context, the plugin options
-are read by \fBSPANK\fR, and options are presented to the user. In
-"remote" context, the plugin is running on a compute node of the job,
-in other words, the plugin is loaded by \fBslurmd\fR. Only the
-\fBinit\fR and \fBexit\fR functions are called in local context.
+or other \fBSLURM\fR user interface. In local context, options provided by
+plugins are read by \fBSPANK\fR, and these options are presented to the user.
+In "remote" context, the plugin is loaded on a compute node of the job,
+in other words, the plugin is loaded by \fBslurmd\fR. In local context, only
+the \fBinit\fR, \fBexit\fR, and \fBuser_local_init\fR functions are called.
 Plugins may query the context in which they are running with the
 \fBspank_remote\fR function defined in \fB<slurm/spank.h>\fR.
 .LP
@@ -35,6 +35,9 @@
 Called just after plugins are loaded. In remote context, this is just
 after job step is initialized. For local context, this is before user
 options are processed.
 .TP
+\fBslurm_spank_local_user_init\fR
+Called in local (srun) context only after all options have been processed.
+.TP
 \fBslurm_spank_user_init\fR
 Called after privileges are temporarily dropped. (remote context only)
 .TP
@@ -68,6 +71,19 @@ SLURM when the plugin calls functions like \fBspank_get_item\fR and
 below) are passed in the argument vector \fBargv\fR with argument
 count \fBac\fR.
 .LP
+\fBSPANK\fR plugins can query the current list of supported slurm_spank\*
+symbols to determine if the current version supports a given plugin hook.
+This may be useful because the list of plugin symbols may grow in the
+future. The query is done using the \fBspank_symbol_supported\fR function,
+which has the following prototype:
+.nf
+
+   int \fBspank_symbol_supported\fR (const char *sym);
+
+.fi
+.LP
+The return value is 1 if the symbol is supported, 0 if not.
+.LP
 \fBSPANK\fR plugins do not have direct access to internally defined SLURM
 data structures. Instead, information about the currently executing
 job is obtained via the \fBspank_get_item\fR function call.
@@ -118,6 +134,10 @@ the job's environment. The prototypes are:
    spank_err_t \fBspank_unsetenv\fR (spank_t spank, const char *var);
 .fi
 .LP
+These are only necessary in remote context since modifications of
+the standard process environment using \fBsetenv\fR(3), \fBgetenv\fR(3),
+and \fBunsetenv\fR(3) may be used in local context.
+.LP
 See \fBspank.h\fR for more information, and \fBEXAMPLES\fR below
 for an example for \fBspank_getenv\fR usage.
 .SH "SPANK OPTIONS"
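A minimal plugin sketch (not part of this patch) tying the two man page
additions above together: probing for the new hook with
spank_symbol_supported() and implementing slurm_spank_local_user_init().
The plugin name and log messages are illustrative, and the sketch assumes
the slurm_info() logging helper declared in <slurm/spank.h>:

    #include <slurm/spank.h>

    /* Every SPANK plugin declares a name and version via this macro. */
    SPANK_PLUGIN(example, 1);

    int slurm_spank_init (spank_t sp, int ac, char **av)
    {
        /* Older SLURM versions simply never call hooks they do not know,
         * so a plugin can detect missing support at init time and warn. */
        if (spank_symbol_supported ("slurm_spank_local_user_init") != 1)
            slurm_info ("example: local_user_init hook not supported here");
        return (0);
    }

    int slurm_spank_local_user_init (spank_t sp, int ac, char **av)
    {
        /* Runs in srun after the allocation exists, before task launch. */
        slurm_info ("example: running in %s context",
                    spank_remote (sp) ? "remote" : "local");
        return (0);
    }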
diff --git a/slurm/spank.h b/slurm/spank.h
index d31fb6bb3bb..e32c6e7f4d9 100644
--- a/slurm/spank.h
+++ b/slurm/spank.h
@@ -77,9 +77,12 @@ typedef int (spank_f) (spank_t spank, int ac, char *argv[]);
  *       |       `-> task_exit ()
  *       `-> fini ()
  *
+ * In srun only the init() and local_user_init() callbacks are used.
+ *
  */
 
 extern spank_f slurm_spank_init;
+extern spank_f slurm_spank_local_user_init;
 extern spank_f slurm_spank_user_init;
 extern spank_f slurm_spank_task_init;
 extern spank_f slurm_spank_task_post_fork;
@@ -188,6 +191,17 @@ extern struct spank_option spank_options [];
  */
 
 BEGIN_C_DECLS
 
+/*
+ *  Determine whether a given spank plugin symbol is supported
+ *  in this version of SPANK interface.
+ *
+ *  Returns:
+ *  = 1   The symbol is supported
+ *  = 0   The symbol is not supported
+ *  = -1  Invalid argument
+ */
+int spank_symbol_supported (const char *symbol);
+
 /*
  *  Determine whether plugin is loaded "local" or "remote."
  *
@@ -207,8 +221,9 @@ int spank_remote (spank_t spank);
  *
  *  Returns ESPANK_SUCCESS on success, ESPANK_NOTASK if an S_TASK*
  *   item is requested from outside a task context, ESPANK_BAD_ARG
- *   if invalid args are passed to spank_get_item, and
- *   ESPANK_NOT_REMOTE if not called from slurmd context.
+ *   if invalid args are passed to spank_get_item or spank_get_item
+ *   is called from an invalid context, and ESPANK_NOT_REMOTE
+ *   if not called from slurmd context or spank_user_local_init.
  */
 spank_err_t spank_get_item (spank_t spank, spank_item_t item, ...);
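Per the updated spank_get_item() comment above, only certain items are
valid outside remote context. A sketch (illustrative, not part of the
patch; assumes slurm_info() from <slurm/spank.h>) of a local_user_init
callback reading items that local context does support:

    #include <slurm/spank.h>

    int slurm_spank_local_user_init (spank_t sp, int ac, char **av)
    {
        uint32_t jobid, ntasks, nnodes;

        /* These three items are valid in local context; most other
         * items would fail here with ESPANK_NOT_REMOTE. */
        if ((spank_get_item (sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS)
            || (spank_get_item (sp, S_JOB_TOTAL_TASK_COUNT,
                                &ntasks) != ESPANK_SUCCESS)
            || (spank_get_item (sp, S_JOB_NNODES,
                                &nnodes) != ESPANK_SUCCESS))
            return (-1);

        slurm_info ("job %u: %u tasks on %u nodes", jobid, ntasks, nnodes);
        return (0);
    }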
diff --git a/src/common/plugstack.c b/src/common/plugstack.c
index 8f5c87f6b99..5899b76cfa9 100644
--- a/src/common/plugstack.c
+++ b/src/common/plugstack.c
@@ -53,6 +53,7 @@
 
 #include "src/common/job_options.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
+#include "src/srun/srun_job.h"
 
 #include <slurm/spank.h>
 
@@ -61,6 +62,7 @@
 struct spank_plugin_operations {
         spank_f *init;
+        spank_f *local_user_init;
         spank_f *user_init;
         spank_f *user_task_init;
         spank_f *task_post_fork;
@@ -68,9 +70,10 @@ struct spank_plugin_operations {
         spank_f *exit;
 };
 
-const int n_spank_syms = 6;
+const int n_spank_syms = 7;
 const char *spank_syms[] = {
         "slurm_spank_init",
+        "slurm_spank_local_user_init",
         "slurm_spank_user_init",
         "slurm_spank_task_init",
         "slurm_spank_task_post_fork",
@@ -127,6 +130,7 @@ typedef enum spank_handle_type {
  */
 typedef enum step_fn {
         SPANK_INIT = 0,
+        LOCAL_USER_INIT,
         STEP_USER_INIT,
         STEP_USER_TASK_INIT,
         STEP_TASK_POST_FORK,
@@ -139,11 +143,10 @@ struct spank_handle {
         int magic;                  /* Magic identifier to ensure validity. */
         spank_handle_type_t type;   /* remote(slurmd) || local(srun)        */
         step_fn_t phase;            /* Which spank fn are we called from?   */
-        slurmd_job_t * job;         /* Reference to current slurmd job      */
+        void * job;                 /* Reference to current srun|slurmd job */
         slurmd_task_info_t * task;  /* Reference to current task (if valid) */
 };
-
 /*
  * SPANK plugins stack
  */
@@ -412,7 +415,7 @@ static int _spank_stack_create(const char *path, List * listp)
 }
 
 static int
-_spank_handle_init(struct spank_handle *spank, slurmd_job_t * job,
+_spank_handle_init(struct spank_handle *spank, void * arg,
                    int taskid, step_fn_t fn)
 {
         memset(spank, 0, sizeof(*spank));
@@ -420,11 +423,15 @@ _spank_handle_init(struct spank_handle *spank, slurmd_job_t * job,
 
         spank->phase = fn;
 
-        if (job != NULL) {
-                spank->type = S_TYPE_REMOTE;
-                spank->job = job;
-                if (taskid >= 0)
-                        spank->task = job->task[taskid];
+        if (arg != NULL) {
+                spank->job = arg;
+                if (fn == LOCAL_USER_INIT)
+                        spank->type = S_TYPE_LOCAL;
+                else {
+                        spank->type = S_TYPE_REMOTE;
+                        if (taskid >= 0)
+                                spank->task = ((slurmd_job_t *) arg)->task[taskid];
+                }
         } else {
                 spank->type = S_TYPE_LOCAL;
         }
@@ -436,6 +443,8 @@ static const char *_step_fn_name(step_fn_t type)
         switch (type) {
         case SPANK_INIT:
                 return ("init");
+        case LOCAL_USER_INIT:
+                return ("local_user_init");
         case STEP_USER_INIT:
                 return ("user_init");
         case STEP_USER_TASK_INIT:
@@ -452,7 +461,7 @@ static const char *_step_fn_name(step_fn_t type)
         return ("unknown");
 }
 
-static int _do_call_stack(step_fn_t type, slurmd_job_t * job, int taskid)
+static int _do_call_stack(step_fn_t type, void * job, int taskid)
 {
         int rc = 0;
         ListIterator i;
@@ -483,6 +492,14 @@ static int _do_call_stack(step_fn_t type, slurmd_job_t * job, int taskid)
                                        fn_name, rc);
                         }
                         break;
+                case LOCAL_USER_INIT:
+                        if (sp->ops.local_user_init) {
+                                rc = (*sp->ops.local_user_init) (spank, sp->ac,
+                                                                 sp->argv);
+                                debug2("spank: %s: %s = %d\n", name,
+                                       fn_name, rc);
+                        }
+                        break;
                 case STEP_USER_INIT:
                         if (sp->ops.user_init) {
                                 rc = (*sp->ops.user_init) (spank, sp->ac,
@@ -566,11 +583,17 @@ int spank_init(slurmd_job_t * job)
         return (0);
 }
 
+
 int spank_user(slurmd_job_t * job)
 {
         return (_do_call_stack(STEP_USER_INIT, job, -1));
 }
 
+int spank_local_user(struct spank_launcher_job_info *job)
+{
+        return (_do_call_stack(LOCAL_USER_INIT, job, -1));
+}
+
 int spank_user_task(slurmd_job_t * job, int taskid)
 {
         return (_do_call_stack(STEP_USER_TASK_INIT, job, taskid));
 }
@@ -980,10 +1003,49 @@ global_to_local_id (slurmd_job_t *job, uint32_t gid, uint32_t *p2uint32)
 }
 
+/*
+ *  Return 1 if spank_item_t is valid for S_TYPE_LOCAL
+ */
+static int valid_in_local_context (spank_item_t item)
+{
+        int rc = 0;
+        switch (item) {
+        case S_JOB_UID:
+        case S_JOB_GID:
+        case S_JOB_ID:
+        case S_JOB_STEPID:
+        case S_JOB_ARGV:
+        case S_JOB_ENV:
+        case S_JOB_TOTAL_TASK_COUNT:
+        case S_JOB_NNODES:
+                rc = 1;
+                break;
+        default:
+                rc = 0;
+        }
+        return (rc);
+}
+
+
 /*
  *  Global functions for SPANK plugins
  */
 
+int spank_symbol_supported (const char *name)
+{
+        int i;
+
+        if ((name == NULL))
+                return (-1);
+
+        for (i = 0; i < n_spank_syms; i++) {
+                if (strcmp (spank_syms [i], name) == 0)
+                        return (1);
+        }
+
+        return (0);
+}
+
 int spank_remote(spank_t spank)
 {
         if ((spank == NULL) || (spank->magic != SPANK_MAGIC))
@@ -1007,70 +1069,102 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
         pid_t pid;
         char ***p2argv;
         slurmd_task_info_t *task;
-        va_list vargs;
-        spank_err_t rc = ESPANK_SUCCESS;
+        slurmd_job_t *slurmd_job = NULL;
+        struct spank_launcher_job_info *launcher_job = NULL;
+        va_list vargs;
+        spank_err_t rc = ESPANK_SUCCESS;
 
         if ((spank == NULL) || (spank->magic != SPANK_MAGIC))
                 return (ESPANK_BAD_ARG);
 
-        if (spank->type != S_TYPE_REMOTE)
+        if ( (spank->type != S_TYPE_REMOTE)
+            && (!valid_in_local_context(item)))
                 return (ESPANK_NOT_REMOTE);
 
         if (spank->job == NULL)
                 return (ESPANK_BAD_ARG);
 
+        if (spank->type == S_TYPE_LOCAL)
+                launcher_job = spank->job;
+        else
+                slurmd_job = spank->job;
+
         va_start(vargs, item);
         switch (item) {
         case S_JOB_UID:
                 p2uid = va_arg(vargs, uid_t *);
-                *p2uid = spank->job->uid;
+                if (spank->type == S_TYPE_LOCAL)
+                        *p2uid = launcher_job->uid;
+                else
+                        *p2uid = slurmd_job->uid;
                 break;
         case S_JOB_GID:
                 p2gid = va_arg(vargs, gid_t *);
-                *p2gid = spank->job->gid;
+                if (spank->type == S_TYPE_LOCAL)
+                        *p2gid = launcher_job->gid;
+                else
+                        *p2gid = slurmd_job->gid;
                 break;
         case S_JOB_SUPPLEMENTARY_GIDS:
                 p2gids = va_arg(vargs, gid_t **);
                 p2int = va_arg(vargs, int *);
-                *p2gids = spank->job->gids;
-                *p2int = spank->job->ngids;
+                *p2gids = slurmd_job->gids;
+                *p2int = slurmd_job->ngids;
                 break;
         case S_JOB_ID:
                 p2uint32 = va_arg(vargs, uint32_t *);
-                *p2uint32 = spank->job->jobid;
+                if (spank->type == S_TYPE_LOCAL)
+                        *p2uint32 = launcher_job->jobid;
+                else
+                        *p2uint32 = slurmd_job->jobid;
                 break;
         case S_JOB_STEPID:
                 p2uint32 = va_arg(vargs, uint32_t *);
-                *p2uint32 = spank->job->stepid;
+                if (spank->type == S_TYPE_LOCAL)
+                        *p2uint32 = launcher_job->stepid;
+                else
+                        *p2uint32 = slurmd_job->stepid;
                 break;
         case S_JOB_NNODES:
                 p2uint32 = va_arg(vargs, uint32_t *);
-                *p2uint32 = spank->job->nnodes;
+                if (spank->type == S_TYPE_LOCAL)
+                        *p2uint32 = launcher_job->step_layout->node_cnt;
+                else
+                        *p2uint32 = slurmd_job->nnodes;
                 break;
         case S_JOB_NODEID:
                 p2uint32 = va_arg(vargs, uint32_t *);
-                *p2uint32 = spank->job->nodeid;
+                *p2uint32 = slurmd_job->nodeid;
                 break;
         case S_JOB_LOCAL_TASK_COUNT:
                 p2uint32 = va_arg(vargs, uint32_t *);
-                *p2uint32 = spank->job->ntasks;
+                *p2uint32 = slurmd_job->ntasks;
                 break;
         case S_JOB_TOTAL_TASK_COUNT:
                 p2uint32 = va_arg(vargs, uint32_t *);
-                *p2uint32 = spank->job->nprocs;
+                if (spank->type == S_TYPE_LOCAL)
+                        *p2uint32 = launcher_job->step_layout->task_cnt;
+                else
+                        *p2uint32 = slurmd_job->nprocs;
                 break;
         case S_JOB_NCPUS:
                 p2uint16 = va_arg(vargs, uint16_t *);
-                *p2uint16 = spank->job->cpus;
+                *p2uint16 = slurmd_job->cpus;
                 break;
         case S_JOB_ARGV:
                 p2int = va_arg(vargs, int *);
-                *p2int = spank->job->argc;
                 p2argv = va_arg(vargs, char ***);
-                *p2argv = spank->job->argv;
+                if (spank->type == S_TYPE_LOCAL) {
+                        *p2int = launcher_job->argc;
+                        *p2argv = launcher_job->argv;
+                } else {
+                        *p2int = slurmd_job->argc;
+                        *p2argv = slurmd_job->argv;
+                }
                 break;
         case S_JOB_ENV:
                 p2argv = va_arg(vargs, char ***);
-                *p2argv = spank->job->env;
+                *p2argv = slurmd_job->env;
                 break;
         case S_TASK_ID:
                 p2int = va_arg(vargs, int *);
@@ -1113,7 +1207,7 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
 
                 if (!tasks_execd(spank))
                         rc = ESPANK_NOT_EXECD;
-                else if (!(task = job_task_info_by_pid (spank->job, pid)))
+                else if (!(task = job_task_info_by_pid (slurmd_job, pid)))
                         rc = ESPANK_NOEXIST;
                 else
                         *p2uint32 = task->gtid;
@@ -1125,7 +1219,7 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
 
                 if (!tasks_execd(spank))
                         rc = ESPANK_NOT_EXECD;
-                else if (!(task = job_task_info_by_pid (spank->job, pid)))
+                else if (!(task = job_task_info_by_pid (slurmd_job, pid)))
                         rc = ESPANK_NOEXIST;
                 else
                         *p2uint32 = task->id;
@@ -1135,15 +1229,15 @@ spank_err_t spank_get_item(spank_t spank, spank_item_t item, ...)
                 p2uint32 = va_arg(vargs, uint32_t *);
                 *p2uint32 = (uint32_t) -1;
 
-                if (uint32 <= spank->job->ntasks)
-                        *p2uint32 = spank->job->task[uint32]->gtid;
+                if (uint32 <= slurmd_job->ntasks)
+                        *p2uint32 = slurmd_job->task[uint32]->gtid;
                 else
                         rc = ESPANK_NOEXIST;
                 break;
         case S_JOB_GLOBAL_TO_LOCAL_ID:
                 uint32 = va_arg(vargs, uint32_t);
                 p2uint32 = va_arg(vargs, uint32_t *);
-                rc = global_to_local_id (spank->job, uint32, p2uint32);
+                rc = global_to_local_id (slurmd_job, uint32, p2uint32);
                 break;
         default:
                 rc = ESPANK_BAD_ARG;
@@ -1170,7 +1264,7 @@ spank_err_t spank_getenv(spank_t spank, const char *var, char *buf,
         if (len < 0)
                 return (ESPANK_BAD_ARG);
 
-        if (!(val = getenvp(spank->job->env, var)))
+        if (!(val = getenvp(((slurmd_job_t *) spank->job)->env, var)))
                 return (ESPANK_ENV_NOEXIST);
 
         if (strlcpy(buf, val, len) >= len)
@@ -1182,6 +1276,8 @@ spank_err_t spank_getenv(spank_t spank, const char *var, char *buf,
 spank_err_t spank_setenv(spank_t spank, const char *var, const char *val,
                 int overwrite)
 {
+        slurmd_job_t * job;
+
         if ((spank == NULL) || (spank->magic != SPANK_MAGIC))
                 return (ESPANK_BAD_ARG);
 
@@ -1194,10 +1290,12 @@ spank_err_t spank_setenv(spank_t spank, const char *var, const char *val,
         if ((var == NULL) || (val == NULL))
                 return (ESPANK_BAD_ARG);
 
-        if (getenvp(spank->job->env, var) && !overwrite)
+        job = spank->job;
+
+        if (getenvp(job->env, var) && !overwrite)
                 return (ESPANK_ENV_EXISTS);
 
-        if (setenvf(&spank->job->env, var, "%s", val) < 0)
+        if (setenvf(&job->env, var, "%s", val) < 0)
                 return (ESPANK_ERROR);
 
         return (ESPANK_SUCCESS);
@@ -1217,7 +1315,7 @@ spank_err_t spank_unsetenv (spank_t spank, const char *var)
         if (var == NULL)
                 return (ESPANK_BAD_ARG);
 
-        unsetenvp(spank->job->env, var);
+        unsetenvp(((slurmd_job_t *) spank->job)->env, var);
 
         return (ESPANK_SUCCESS);
 }
diff --git a/src/common/plugstack.h b/src/common/plugstack.h
index 9c5bc9441e3..3a10e683449 100644
--- a/src/common/plugstack.h
+++ b/src/common/plugstack.h
@@ -51,10 +51,22 @@
 #include "src/common/job_options.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
 
+struct spank_launcher_job_info {
+        uid_t uid;
+        gid_t gid;
+        uint32_t jobid;
+        uint32_t stepid;
+        slurm_step_layout_t *step_layout;
+        int argc;
+        char **argv;
+};
+
 int spank_init (slurmd_job_t *job);
 
 int spank_user (slurmd_job_t *job);
 
+int spank_local_user (struct spank_launcher_job_info *job);
+
 int spank_user_task (slurmd_job_t *job, int taskid);
 
 int spank_task_post_fork (slurmd_job_t *job, int taskid);
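The new spank_local_user() entry point takes a spank_launcher_job_info
filled in by the launcher. A sketch of the caller side (the function name
and field values are hypothetical; srun's real caller,
_call_spank_local_user(), appears at the end of this patch, and the sketch
assumes SLURM's internal headers are on the include path):

    #include <sys/types.h>
    #include <unistd.h>
    #include "src/common/plugstack.h"

    /* Hypothetical launcher-side helper invoking the local hook. */
    static int launch_with_plugins (uint32_t jobid, uint32_t stepid,
                                    slurm_step_layout_t *layout,
                                    int argc, char **argv)
    {
        struct spank_launcher_job_info info;

        info.uid = getuid ();
        info.gid = getgid ();
        info.jobid = jobid;
        info.stepid = stepid;
        info.step_layout = layout;  /* must be valid: local context reads
                                     * node_cnt/task_cnt through it */
        info.argc = argc;
        info.argv = argv;

        return spank_local_user (&info);
    }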
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 404564b1d28..b8440228a48 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -264,12 +264,13 @@ _has_state_changed(void)
 static void
 _attempt_backfill(struct part_record *part_ptr)
 {
-        int i, error_code = 0;
+        int i, cg_hung = 0, error_code = 0;
         uint32_t max_pending_prio = 0;
         uint32_t min_pend_job_size = INFINITE;
         struct job_record *job_ptr;
         ListIterator job_iterator;
         part_specs_t part_specs;
+        time_t now = time(NULL);
 
 #if __DEBUG
         info("backfill: attempt on partition %s", part_ptr->name);
@@ -289,6 +290,13 @@ _attempt_backfill(struct part_record *part_ptr)
                         continue;       /* job in different partition */
 
                 if (job_ptr->job_state & JOB_COMPLETING) {
+                        long wait_time = (long) difftime(now, job_ptr->end_time);
+                        if (wait_time > 600) {
+                                /* Job has been in completing state for
+                                 * >10 minutes, try to schedule around it */
+                                cg_hung++;
+                                continue;
+                        }
 #if __DEBUG
                         info("backfill: Job %u completing, skip partition",
                              job_ptr->job_id);
@@ -315,7 +323,7 @@ _attempt_backfill(struct part_record *part_ptr)
         if (error_code)
                 goto cleanup;
 
-        i = list_count(run_job_list);
+        i = list_count(run_job_list) + cg_hung;
         if ( (i == 0) || (i > MAX_JOB_CNT) )
                 goto cleanup;   /* no running jobs or already have many */
         if (list_is_empty(pend_job_list))
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 468cd0b88fe..e4148d02fe1 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1020,7 +1020,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
                 if ((job_ptr->details == NULL) ||
                     (job_ptr->kill_on_node_fail) ||
                     (job_ptr->node_cnt <= 1)) {
-                        error("Killing job_id %u on failed node %s",
+                        info("Killing job_id %u on failed node %s",
                              job_ptr->job_id, node_name);
                         job_ptr->job_state = JOB_NODE_FAIL |
                                              JOB_COMPLETING;
@@ -2999,9 +2999,11 @@ void reset_job_bitmaps(void)
 
                 _reset_step_bitmaps(job_ptr);
 
-                if ((job_ptr->kill_on_step_done) &&
-                    (list_count(job_ptr->step_list) <= 1))
+                if ((job_ptr->kill_on_step_done)
+                    && (list_count(job_ptr->step_list) <= 1)) {
+                        info("Single job step done, job is complete");
                         job_fail = true;
+                }
 
                 if (job_fail) {
                         if (job_ptr->job_state == JOB_PENDING) {
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index af18b3ee43a..4c4a1b0a3a8 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -230,7 +230,7 @@ void ping_nodes (void)
 
                 /* Do not keep pinging down nodes since this can induce
                  * huge delays in hierarchical communication fail-over */
-                if (no_resp_flag)
+                if ((no_resp_flag) && (base_state == NODE_STATE_DOWN))
                         continue;
 
                 hostlist_push(ping_agent_args->hostlist, node_ptr->name);
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index e8dbea7fa95..c74d68ee29d 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1702,7 +1702,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
                              slurm_strerror(error_code));
                 slurm_send_rc_msg(msg, error_code);
         } else {
-                info("_slurm_rpc_submit_batch_job JobId=%u %s",
+                info("_launch_batch_step JobId=%u %s",
                      job_desc_msg->job_id, TIME_STR);
                 submit_msg.job_id  = job_desc_msg->job_id;
                 submit_msg.step_id = step_id;
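The req.c hunks below fix a logging bug: the old code passed rc (already
set to -1) to slurm_strerror(), so the message described -1 rather than
the failure in errno. The fix uses "%m", which SLURM's logging layer
expands from errno, as glibc's printf does. A standalone sketch of the
corrected pattern (the verbose_m() stand-in is hypothetical; plain printf
does not expand %m portably, so it substitutes strerror(errno) itself):

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>

    /* Stand-in for SLURM's verbose(): prints strerror(errno) where
     * SLURM's logger would expand "%m". */
    static void verbose_m (int signo, pid_t pid)
    {
        fprintf (stderr, "Error sending signal %d to pid %d: %s\n",
                 signo, (int) pid, strerror (errno));
    }

    int main (void)
    {
        if (kill ((pid_t) 999999, SIGTERM) == -1)  /* almost certainly fails */
            verbose_m (SIGTERM, (pid_t) 999999);   /* errno set by kill(2) */
        return 0;
    }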
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index a3abc5d7eef..edda3429f6d 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -560,9 +560,8 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid)
 
         if (killpg(job->pgid, signal) == -1) {
                 rc = -1;
-                verbose("Error sending signal %d to %u.%u, pgid %d: %s",
-                        signal, job->jobid, job->stepid, job->pgid,
-                        slurm_strerror(rc));
+                verbose("Error sending signal %d to %u.%u, pgid %d: %m",
+                        signal, job->jobid, job->stepid, job->pgid);
         } else {
                 verbose("Sent signal %d to %u.%u, pgid %d",
                         signal, job->jobid, job->stepid, job->pgid);
@@ -634,9 +633,9 @@ _handle_signal_task_local(int fd, slurmd_job_t *job, uid_t uid)
 
         if (kill(job->task[ltaskid]->pid, signal) == -1) {
                 rc = -1;
-                verbose("Error sending signal %d to %u.%u, pid %d: %s",
+                verbose("Error sending signal %d to %u.%u, pid %d: %m",
                         signal, job->jobid, job->stepid,
-                        job->task[ltaskid]->pid, slurm_strerror(rc));
+                        job->task[ltaskid]->pid);
         } else {
                 verbose("Sent signal %d to %u.%u, pid %d",
                         signal, job->jobid, job->stepid,
@@ -699,9 +698,8 @@ _handle_signal_container(int fd, slurmd_job_t *job, uid_t uid)
         if (slurm_container_signal(job->cont_id, signal) < 0) {
                 rc = -1;
                 errnum = errno;
-                verbose("Error sending signal %d to %u.%u: %s",
-                        signal, job->jobid, job->stepid,
-                        slurm_strerror(rc));
+                verbose("Error sending signal %d to %u.%u: %m",
+                        signal, job->jobid, job->stepid);
         } else {
                 verbose("Sent signal %d to %u.%u",
                         signal, job->jobid, job->stepid);
@@ -759,9 +757,8 @@ _handle_terminate(int fd, slurmd_job_t *job, uid_t uid)
         if (slurm_container_signal(job->cont_id, SIGKILL) < 0) {
                 rc = -1;
                 errnum = errno;
-                verbose("Error sending signal %d to %u.%u: %s",
-                        SIGKILL, job->jobid, job->stepid,
-                        slurm_strerror(rc));
+                verbose("Error sending signal %d to %u.%u: %m",
+                        SIGKILL, job->jobid, job->stepid);
         } else {
                 verbose("Sent signal %d to %u.%u",
                         signal, job->jobid, job->stepid);
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index c45352bca9b..96d84852dcf 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -413,6 +413,7 @@ job_desc_msg_create_from_opts (char *script)
         } else {
                 debug("loading nodes from hostfile %s", hostfile);
+                opt.nodelist = xstrdup(nodelist);
                 j->req_nodes = xstrdup(nodelist);
                 free(nodelist);
                 opt.distribution = SLURM_DIST_ARBITRARY;
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 0df946e51e0..65279b9e96d 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -2376,8 +2376,7 @@ _create_path_list(void)
 
         if (!path) {
                 error("Error in PATH environment variable");
-                list_destroy(l);
-                return NULL;
+                return l;
         }
 
         c = lc = path;
@@ -2679,7 +2678,7 @@ static void _help(void)
 "  -l, --label             prepend task number to lines of stdout/err\n"
 "  -u, --unbuffered        do not line-buffer stdout/err\n"
 "  -m, --distribution=type distribution method for processes to nodes\n"
-"                          (type = block|cyclic|hostfile)\n"
+"                          (type = block|cyclic|arbitrary)\n"
 "  -J, --job-name=jobname  name of job\n"
 "      --jobid=id          run under already allocated job\n"
 "      --mpi=type          type of MPI being used\n"
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 1f1d8a2641a..679e9f79c7c 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -124,6 +124,7 @@ static void  _run_srun_epilog (srun_job_t *job);
 static int   _run_srun_script (srun_job_t *job, char *script);
 static int   _change_rlimit_rss(void);
 static int   _slurm_debug_env_val (void);
+static int   _call_spank_local_user (srun_job_t *job);
 
 int srun(int ac, char **av)
 {
@@ -324,6 +325,9 @@ int srun(int ac, char **av)
 
         /* job structure should now be filled in */
 
+        if (_call_spank_local_user (job) < 0)
+                job_fatal(job, "Failure in local plugin stack");
+
         /*
          *  Enhance environment for job
          */
@@ -459,6 +463,22 @@ int srun(int ac, char **av)
         exit(exitcode);
 }
 
+static int _call_spank_local_user (srun_job_t *job)
+{
+        struct spank_launcher_job_info info[1];
+
+        info->uid = opt.uid;
+        info->gid = opt.gid;
+        info->jobid = job->jobid;
+        info->stepid = job->stepid;
+        info->step_layout = job->step_layout;
+        info->argc = remote_argc;
+        info->argv = remote_argv;
+
+        return spank_local_user(info);
+}
+
+
 static int _slurm_debug_env_val (void)
 {
         long int level = 0;
-- 
GitLab
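With the pieces above in place, srun loads the plugin stack from
plugstack.conf and invokes slurm_spank_local_user_init() between node
allocation and task launch: init() fires when the stack is loaded, and
local_user_init() fires once the job structure is filled in, which is
exactly where _call_spank_local_user() is inserted above. A hypothetical
configuration line enabling a plugin built from the earlier sketches (the
path is illustrative):

    required  /usr/lib/slurm/example.so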