From f93df7c5127e98be7a173f87a9f36b1f36c92a8a Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 27 Nov 2007 22:27:27 +0000 Subject: [PATCH] svn merge -r12667:12698 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2 --- NEWS | 3 ++ doc/man/man1/sbatch.1 | 26 ++++++++---- src/common/env.c | 13 ++++-- src/common/env.h | 11 +++-- src/plugins/sched/wiki/get_jobs.c | 68 +++++++++++++++++++++++++----- src/plugins/sched/wiki2/hostlist.c | 16 ++++--- src/sbatch/opt.c | 28 ++++++++++-- src/sbatch/opt.h | 3 +- src/sbatch/sbatch.c | 5 ++- src/slurmctld/step_mgr.c | 5 +-- 10 files changed, 138 insertions(+), 40 deletions(-) diff --git a/NEWS b/NEWS index 6d5c8e37bae..469d6e57512 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,9 @@ documents those changes that are of interest to users and admins. the job. Only send it to specific nodes which have not reported completion. -- Support larger environment variables 64K instead of BUFSIZ (8k on some systems). + -- If a job is being requeued, job step create requests will print a + warning and repeatedly retry rather than aborting. + -- Add optional mode value to srun and sbatch --get-user-env option. * Changes in SLURM 1.2.19 ========================= diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index f2410b8c666..b4bd85ecfe6 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -174,15 +174,25 @@ The order of the node names in the list is not important; the node names will be sorted my SLURM. .TP -\fB\-\-get\-user\-env\fR[=\fItimeout\fR] +\fB\-\-get\-user\-env\fR[=\fItimeout\fR][\fImode\fR] This option will tell sbatch to retrieve the -login environment variables for the user specified in the \-\-uid option. -The environment variables are retrieved by running "su - <username> -c -/usr/bin/env" and parsing the output. Be aware that any environment -variables already set in sbatch's environment will take precedence over any -environment variables in the user's login environment. 
-Optional timeout value is in seconds. Default value is 8 seconds. -NOTE: This option only works if the caller has an effective uid of "root". +login environment variables for the user specified in the \fB\-\-uid\fR option. +The environment variables are retrieved by running a command such as +"su - <username> -c /usr/bin/env" and parsing the output. +Be aware that any environment variables already set in sbatch's environment +will take precedence over any environment variables in the user's +login environment. +The optional \fItimeout\fR value is in seconds. Default value is 8 seconds. +The optional \fImode\fR value controls the "su" options. +With a \fImode\fR value of "S", "su" is executed without the "\-" option. +With a \fImode\fR value of "L", "su" is executed with the "\-" option, +replicating the login environment. +If \fImode\fR is not specified, the mode established at SLURM build time +is used. +Examples of use include "\-\-get\-user\-env", "\-\-get\-user\-env=10", +"\-\-get\-user\-env=10L", and "\-\-get\-user\-env=S". +NOTE: This option only works if the caller has an +effective uid of "root". This option was originally created for use by Moab. .TP diff --git a/src/common/env.c b/src/common/env.c index 2741990550c..57827a3830d 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -1288,12 +1288,13 @@ char **_load_env_cache(const char *username) * in the event that option 1 times out. * * timeout value is in seconds or zero for default (8 secs) + * mode is 1 for short ("su <user>"), 2 for long ("su - <user>") * On error, returns NULL. * * NOTE: The calling process must have an effective uid of root for * this function to succeed. 
*/ -char **env_array_user_default(const char *username, int timeout) +char **env_array_user_default(const char *username, int timeout, int mode) { FILE *su; char line[ENV_BUFSIZE]; @@ -1332,11 +1333,17 @@ char **env_array_user_default(const char *username, int timeout) snprintf(cmdstr, sizeof(cmdstr), "echo; echo; echo; echo %s; env; echo %s", starttoken, stoptoken); + if (mode == 1) + execl("/bin/su", "su", username, "-c", cmdstr, NULL); + else if (mode == 2) + execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); + else { /* Default system configuration */ #ifdef LOAD_ENV_NO_LOGIN - execl("/bin/su", "su", username, "-c", cmdstr, NULL); + execl("/bin/su", "su", username, "-c", cmdstr, NULL); #else - execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); + execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); #endif + } exit(1); } diff --git a/src/common/env.h b/src/common/env.h index c62e0054593..44c3def0439 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -241,15 +241,20 @@ void env_array_set_environment(char **env_array); /* * Return an array of strings representing the specified user's default - * environment variables, as determined by calling (more-or-less) - * "/bin/su - <username> -c /usr/bin/env". + * environment variables following a two-prongged approach. + * 1. Execute (more or less): "/bin/su - <username> -c /usr/bin/env" + * Depending upon the user's login scripts, this may take a very + * long time to complete or possibly never return + * 2. Load the user environment from a cache file. This is used + * in the event that option 1 times out. * * timeout value is in seconds or zero for default (8 secs) + * mode is 1 for short ("su <user>"), 2 for long ("su - <user>") * On error, returns NULL. * * NOTE: The calling process must have an effective uid of root for * this function to succeed. 
*/ -char **env_array_user_default(const char *username, int timeout); +char **env_array_user_default(const char *username, int timeout, int mode); #endif diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c index a5f63fa07f2..e021e05a847 100644 --- a/src/plugins/sched/wiki/get_jobs.c +++ b/src/plugins/sched/wiki/get_jobs.c @@ -39,6 +39,7 @@ #include <sys/types.h> #include "./msg.h" +#include "src/common/hostlist.h" #include "src/common/list.h" #include "src/common/uid.h" #include "src/slurmctld/locks.h" @@ -57,6 +58,8 @@ static uint32_t _get_job_submit_time(struct job_record *job_ptr); static uint32_t _get_job_suspend_time(struct job_record *job_ptr); static uint32_t _get_job_tasks(struct job_record *job_ptr); static uint32_t _get_job_time_limit(struct job_record *job_ptr); +static char * _task_list(struct job_record *job_ptr); + #define SLURM_INFO_ALL 0 #define SLURM_INFO_VOLITILE 1 @@ -210,8 +213,7 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); xfree(hosts); } else if (!IS_JOB_FINISHED(job_ptr)) { - char *hosts = bitmap2wiki_node_name( - job_ptr->node_bitmap); + char *hosts = _task_list(job_ptr); snprintf(tmp, sizeof(tmp), "TASKLIST=%s;", hosts); xstrcat(buf, tmp); @@ -231,15 +233,13 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) (uint32_t) _get_job_time_limit(job_ptr)); xstrcat(buf, tmp); - if (job_ptr->job_state == JOB_PENDING) { - /* Don't report actual tasks or nodes allocated since - * this can impact requeue on heterogenous clusters */ - snprintf(tmp, sizeof(tmp), - "TASKS=%u;NODES=%u;", - _get_job_tasks(job_ptr), - _get_job_min_nodes(job_ptr)); - xstrcat(buf, tmp); - } + /* Don't report actual tasks or nodes allocated since + * this can impact requeue on heterogenous clusters */ + snprintf(tmp, sizeof(tmp), + "TASKS=%u;NODES=%u;", + _get_job_tasks(job_ptr), + _get_job_min_nodes(job_ptr)); + xstrcat(buf, tmp); snprintf(tmp, sizeof(tmp), "DPROCS=%u;", @@ -273,6 
+273,18 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); } + if (job_ptr->account) { + snprintf(tmp, sizeof(tmp), + "ACCOUNT=%s;", job_ptr->account); + xstrcat(buf, tmp); + } + + if (job_ptr->comment && job_ptr->comment[0]) { + snprintf(tmp,sizeof(tmp), + "COMMENT=%s;", job_ptr->comment); + xstrcat(buf,tmp); + } + if (state_info == SLURM_INFO_VOLITILE) return buf; @@ -442,3 +454,37 @@ extern char * bitmap2wiki_node_name(bitstr_t *bitmap) } return buf; } + + +/* Return task list in Maui format: tux0:tux0:tux1:tux1:tux2 */ +static char * _task_list(struct job_record *job_ptr) +{ + int i, j, task_cnt; + char *buf = NULL, *host; + hostlist_t hl = hostlist_create(job_ptr->nodes); + + buf = xstrdup(""); /* always return a string, even if empty */ + if (hl == NULL) + return buf; + + for (i=0; i<job_ptr->alloc_lps_cnt; i++) { + host = hostlist_shift(hl); + if (host == NULL) { + error("bad alloc_lps_cnt for job %u (%s, %d)", + job_ptr->job_id, job_ptr->nodes, + job_ptr->alloc_lps_cnt); + break; + } + task_cnt = job_ptr->alloc_lps[i]; /* processors allocated on node i */ + if (job_ptr->details && job_ptr->details->cpus_per_task) + task_cnt /= job_ptr->details->cpus_per_task; + for (j=0; j<task_cnt; j++) { + if (buf[0]) /* buf is never NULL here; only separate once an entry exists */ + xstrcat(buf, ":"); + xstrcat(buf, host); + } + free(host); + } + hostlist_destroy(hl); + return buf; +} diff --git a/src/plugins/sched/wiki2/hostlist.c b/src/plugins/sched/wiki2/hostlist.c index c31dc6bf892..7a96bd30cc5 100644 --- a/src/plugins/sched/wiki2/hostlist.c +++ b/src/plugins/sched/wiki2/hostlist.c @@ -160,7 +160,7 @@ extern char * slurm_job2moab_task_list(struct job_record *job_ptr) /* Return task list in Moab format 1: tux0:tux0:tux1:tux1:tux2 */ static char * _task_list(struct job_record *job_ptr) { - int i, j; + int i, j, task_cnt; char *buf = NULL, *host; hostlist_t hl = hostlist_create(job_ptr->nodes); @@ -178,7 +178,10 @@ static char * _task_list(struct job_record *job_ptr) job_ptr->alloc_lps_cnt); break; } - for (j=0; j<job_ptr->alloc_lps[i]; j++) { + task_cnt = 
job_ptr->alloc_lps[i]; + if (job_ptr->details && job_ptr->details->cpus_per_task) + task_cnt /= job_ptr->details->cpus_per_task; + for (j=0; j<task_cnt; j++) { if (buf) xstrcat(buf, ":"); xstrcat(buf, host); @@ -247,7 +250,7 @@ static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps) /* Return task list in Moab format 2: tux[0-1]*2:tux2 */ static char * _task_list_exp(struct job_record *job_ptr) { - int i, reps = -1; + int i, reps = -1, task_cnt; char *buf = NULL, *host; hostlist_t hl = hostlist_create(job_ptr->nodes); hostlist_t hl_tmp = (hostlist_t) NULL; @@ -267,7 +270,10 @@ static char * _task_list_exp(struct job_record *job_ptr) break; } - if (reps == job_ptr->alloc_lps[i]) { + task_cnt = job_ptr->alloc_lps[i]; + if (job_ptr->details && job_ptr->details->cpus_per_task) + task_cnt /= job_ptr->details->cpus_per_task; + if (reps == task_cnt) { /* append to existing hostlist record */ if (hostlist_push(hl_tmp, host) == 0) error("hostlist_push failure"); @@ -278,7 +284,7 @@ static char * _task_list_exp(struct job_record *job_ptr) /* start new hostlist record */ hl_tmp = hostlist_create(host); if (hl_tmp) - reps = job_ptr->alloc_lps[i]; + reps = task_cnt; else error("hostlist_create failure"); } diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 7eecac2d9b7..9b9041e2445 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -145,6 +145,7 @@ static void _opt_pbs_batch_script(const void *body, int size); /* set options based upon env vars */ static void _opt_env(void); +static void _proc_get_user_env(char *optarg); /* list known options and their settings */ static void _opt_list(void); @@ -278,7 +279,9 @@ static void _opt_default() opt.ifname = xstrdup("/dev/null"); opt.ofname = NULL; opt.efname = NULL; - opt.get_user_env = -1; + + opt.get_user_env_time = -1; + opt.get_user_env_mode = -1; } /*---[ env var processing ]-----------------------------------------------*/ @@ -1213,9 +1216,9 @@ static void _set_options(int argc, char **argv) break; case 
LONG_OPT_GET_USER_ENV: if (optarg) - opt.get_user_env = strtol(optarg, NULL, 10); + _proc_get_user_env(optarg); else - opt.get_user_env = 0; + opt.get_user_env_time = 0; break; default: fatal("Unrecognized command line parameter %c", @@ -1228,6 +1231,25 @@ static void _set_options(int argc, char **argv) } } +static void _proc_get_user_env(char *optarg) /* parse --get-user-env argument: optional leading timeout digits, then optional mode letter */ +{ + char *end_ptr; + + if ((optarg[0] >= '0') && (optarg[0] <= '9')) + opt.get_user_env_time = strtol(optarg, &end_ptr, 10); /* end_ptr is left at the first non-digit */ + else { + opt.get_user_env_time = 0; /* zero selects the default timeout */ + end_ptr = optarg; /* no timeout given: whole argument is the mode */ + } + + if ((end_ptr == NULL) || (end_ptr[0] == '\0')) + return; /* no mode suffix: get_user_env_mode is left unchanged */ + if ((end_ptr[0] == 's') || (end_ptr[0] == 'S')) + opt.get_user_env_mode = 1; /* short: "su <user>" */ + else if ((end_ptr[0] == 'l') || (end_ptr[0] == 'L')) + opt.get_user_env_mode = 2; /* long: "su - <user>" */ +} + static void _set_pbs_options(int argc, char **argv) { int opt_char, option_index = 0; diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h index cb8097a05ad..ab0d10c0f1e 100644 --- a/src/sbatch/opt.h +++ b/src/sbatch/opt.h @@ -133,7 +133,8 @@ typedef struct sbatch_options { char *ifname; /* input file name */ char *ofname; /* output file name */ char *efname; /* error file name */ - int get_user_env; /* --get-user-env[=timeout] */ + int get_user_env_time; /* --get-user-env[=timeout] */ + int get_user_env_mode; /* --get-user-env=[S|L] */ } opt_t; extern opt_t opt; diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 75f0082e0de..98fe308fdcc 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -218,12 +218,13 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) desc->shared = opt.shared; desc->environment = NULL; - if (opt.get_user_env >= 0) { + if (opt.get_user_env_time >= 0) { struct passwd *pw = NULL; pw = getpwuid(opt.uid); if (pw != NULL) { desc->environment = env_array_user_default(pw->pw_name, - opt.get_user_env); + opt.get_user_env_time, + opt.get_user_env_mode); /* FIXME - should we abort if j->environment * is NULL? 
*/ } diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 2185dd3b9ac..b2b4f29aadf 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -766,7 +766,7 @@ step_create(job_step_create_request_msg_t *step_specs, if (job_ptr == NULL) return ESLURM_INVALID_JOB_ID ; - if (job_ptr->job_state == JOB_SUSPENDED) + if ((job_ptr->job_state == JOB_SUSPENDED) || IS_JOB_PENDING(job_ptr)) return ESLURM_DISABLED; if (batch_step) { @@ -781,9 +781,6 @@ step_create(job_step_create_request_msg_t *step_specs, (step_specs->user_id != 0)) return ESLURM_ACCESS_DENIED ; - if (IS_JOB_PENDING(job_ptr)) - return ESLURM_INVALID_JOB_ID ; - if (IS_JOB_FINISHED(job_ptr) || (job_ptr->end_time <= time(NULL))) return ESLURM_ALREADY_DONE; -- GitLab