From f93df7c5127e98be7a173f87a9f36b1f36c92a8a Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 27 Nov 2007 22:27:27 +0000
Subject: [PATCH] svn merge -r12667:12698
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2

---
 NEWS                               |  3 ++
 doc/man/man1/sbatch.1              | 26 ++++++++----
 src/common/env.c                   | 13 ++++--
 src/common/env.h                   | 11 +++--
 src/plugins/sched/wiki/get_jobs.c  | 68 +++++++++++++++++++++++++-----
 src/plugins/sched/wiki2/hostlist.c | 16 ++++---
 src/sbatch/opt.c                   | 28 ++++++++++--
 src/sbatch/opt.h                   |  3 +-
 src/sbatch/sbatch.c                |  5 ++-
 src/slurmctld/step_mgr.c           |  5 +--
 10 files changed, 138 insertions(+), 40 deletions(-)

diff --git a/NEWS b/NEWS
index 6d5c8e37bae..469d6e57512 100644
--- a/NEWS
+++ b/NEWS
@@ -113,6 +113,9 @@ documents those changes that are of interest to users and admins.
     the job. Only send it to specific nodes which have not reported completion.
  -- Support larger environment variables 64K instead of BUFSIZ (8k on some 
     systems).
+ -- If a job is being requeued, job step create requests will print a 
+    warning and repeatedly retry rather than aborting.
+ -- Add optional mode value to srun and sbatch --get-user-env option.
 
 * Changes in SLURM 1.2.19
 =========================
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index f2410b8c666..b4bd85ecfe6 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -174,15 +174,25 @@ The order of the node names in the list is not important; the node names
 will be sorted my SLURM.
 
 .TP
-\fB\-\-get\-user\-env\fR[=\fItimeout\fR]
+\fB\-\-get\-user\-env\fR[=\fItimeout\fR][\fImode\fR]
 This option will tell sbatch to retrieve the
-login environment variables for the user specified in the \-\-uid option.
-The environment variables are retrieved by running "su - <username> -c
-/usr/bin/env" and parsing the output.  Be aware that any environment
-variables already set in sbatch's environment will take precedence over any
-environment variables in the user's login environment.
-Optional timeout value is in seconds. Default value is 8 seconds.
-NOTE: This option only works if the caller has an effective uid of "root".
+login environment variables for the user specified in the \fB\-\-uid\fR option.
+The environment variables are retrieved by running something of this sort
+"su - <username> -c /usr/bin/env" and parsing the output.
+Be aware that any environment variables already set in sbatch's environment
+will take precedence over any environment variables in the user's
+login environment.
+The optional \fItimeout\fR value is in seconds. Default value is 8 seconds.
+The optional \fImode\fR value controls the "su" options.
+With a \fImode\fR value of "S", "su" is executed without the "\-" option.
+With a \fImode\fR value of "L", "su" is executed with the "\-" option,
+replicating the login environment.
+If \fImode\fR is not specified, the mode established at SLURM build time
+is used.
+Examples of use include "\-\-get\-user\-env", "\-\-get\-user\-env=10",
+"\-\-get\-user\-env=10L", and "\-\-get\-user\-env=S".
+NOTE: This option only works if the caller has an
+effective uid of "root".
 This option was originally created for use by Moab.
 
 .TP
diff --git a/src/common/env.c b/src/common/env.c
index 2741990550c..57827a3830d 100644
--- a/src/common/env.c
+++ b/src/common/env.c
@@ -1288,12 +1288,13 @@ char **_load_env_cache(const char *username)
  *    in the event that option 1 times out.
  *
  * timeout value is in seconds or zero for default (8 secs) 
+ * mode: 1 = short ("su <user>"), 2 = long ("su - <user>"), else build default
  * On error, returns NULL.
  *
  * NOTE: The calling process must have an effective uid of root for
  * this function to succeed.
  */
-char **env_array_user_default(const char *username, int timeout)
+char **env_array_user_default(const char *username, int timeout, int mode)
 {
 	FILE *su;
 	char line[ENV_BUFSIZE];
@@ -1332,11 +1333,17 @@ char **env_array_user_default(const char *username, int timeout)
 		snprintf(cmdstr, sizeof(cmdstr),
 			 "echo; echo; echo; echo %s; env; echo %s",
 			 starttoken, stoptoken);
+		if      (mode == 1)
+			execl("/bin/su", "su", username, "-c", cmdstr, NULL);
+		else if (mode == 2)
+			execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL);
+		else {	/* Default system configuration */
 #ifdef LOAD_ENV_NO_LOGIN
-		execl("/bin/su", "su", username, "-c", cmdstr, NULL);
+			execl("/bin/su", "su", username, "-c", cmdstr, NULL);
 #else
-		execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL);
+			execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL);
 #endif
+		}
 		exit(1);
 	}
 
diff --git a/src/common/env.h b/src/common/env.h
index c62e0054593..44c3def0439 100644
--- a/src/common/env.h
+++ b/src/common/env.h
@@ -241,15 +241,20 @@ void env_array_set_environment(char **env_array);
 
 /*
  * Return an array of strings representing the specified user's default
- * environment variables, as determined by calling (more-or-less)
- * "/bin/su - <username> -c /usr/bin/env".
+ * environment variables following a two-pronged approach.
+ * 1. Execute (more or less): "/bin/su - <username> -c /usr/bin/env"
+ *    Depending upon the user's login scripts, this may take a very
+ *    long time to complete or possibly never return
+ * 2. Load the user environment from a cache file. This is used
+ *    in the event that option 1 times out.
  *
  * timeout value is in seconds or zero for default (8 secs)
+ * mode: 1 = short ("su <user>"), 2 = long ("su - <user>"), else build default
  * On error, returns NULL.
  *
  * NOTE: The calling process must have an effective uid of root for
  * this function to succeed.
  */
-char **env_array_user_default(const char *username, int timeout);
+char **env_array_user_default(const char *username, int timeout, int mode);
 
 #endif
diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c
index a5f63fa07f2..e021e05a847 100644
--- a/src/plugins/sched/wiki/get_jobs.c
+++ b/src/plugins/sched/wiki/get_jobs.c
@@ -39,6 +39,7 @@
 #include <sys/types.h>
 
 #include "./msg.h"
+#include "src/common/hostlist.h"
 #include "src/common/list.h"
 #include "src/common/uid.h"
 #include "src/slurmctld/locks.h"
@@ -57,6 +58,8 @@ static uint32_t	_get_job_submit_time(struct job_record *job_ptr);
 static uint32_t	_get_job_suspend_time(struct job_record *job_ptr);
 static uint32_t	_get_job_tasks(struct job_record *job_ptr);
 static uint32_t	_get_job_time_limit(struct job_record *job_ptr);
+static char *	_task_list(struct job_record *job_ptr);
+
 
 #define SLURM_INFO_ALL		0
 #define SLURM_INFO_VOLITILE	1
@@ -210,8 +213,7 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 		xstrcat(buf, tmp);
 		xfree(hosts);
 	} else if (!IS_JOB_FINISHED(job_ptr)) {
-		char *hosts = bitmap2wiki_node_name(
-			job_ptr->node_bitmap);
+		char *hosts = _task_list(job_ptr);
 		snprintf(tmp, sizeof(tmp),
 			"TASKLIST=%s;", hosts);
 		xstrcat(buf, tmp);
@@ -231,15 +233,13 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 		(uint32_t) _get_job_time_limit(job_ptr));
 	xstrcat(buf, tmp);
 
-	if (job_ptr->job_state  == JOB_PENDING) {
-		/* Don't report actual tasks or nodes allocated since
-		 * this can impact requeue on heterogenous clusters */
-		snprintf(tmp, sizeof(tmp),
-			"TASKS=%u;NODES=%u;",
-			_get_job_tasks(job_ptr),
-			_get_job_min_nodes(job_ptr));
-		xstrcat(buf, tmp);
-	}
+	/* Don't report actual tasks or nodes allocated since
+	 * this can impact requeue on heterogenous clusters */
+	snprintf(tmp, sizeof(tmp),
+		"TASKS=%u;NODES=%u;",
+		_get_job_tasks(job_ptr),
+		_get_job_min_nodes(job_ptr));
+	xstrcat(buf, tmp);
 
 	snprintf(tmp, sizeof(tmp),
 		"DPROCS=%u;",
@@ -273,6 +273,18 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 		xstrcat(buf, tmp);
 	}
 
+	if (job_ptr->account) {
+		snprintf(tmp, sizeof(tmp),
+			"ACCOUNT=%s;", job_ptr->account);
+		xstrcat(buf, tmp);
+	}
+
+	if (job_ptr->comment && job_ptr->comment[0]) {
+		snprintf(tmp,sizeof(tmp),
+			"COMMENT=%s;", job_ptr->comment);
+		xstrcat(buf,tmp);
+	}
+
 	if (state_info == SLURM_INFO_VOLITILE)
 		return buf;
 
@@ -442,3 +454,37 @@ extern char *   bitmap2wiki_node_name(bitstr_t *bitmap)
 	}
 	return buf;
 }
+
+
+/* Return task list in Maui format: tux0:tux0:tux1:tux1:tux2
+ * (each host repeated alloc_lps[i] / cpus_per_task times; may be empty) */
+static char * _task_list(struct job_record *job_ptr)
+{
+	int i, j, task_cnt;
+	char *buf = xstrdup(""), *host;
+	hostlist_t hl = hostlist_create(job_ptr->nodes);
+
+	if (hl == NULL)
+		return buf;
+
+	for (i=0; i<job_ptr->alloc_lps_cnt; i++) {
+		host = hostlist_shift(hl);
+		if (host == NULL) {
+			error("bad alloc_lps_cnt for job %u (%s, %d)",
+				job_ptr->job_id, job_ptr->nodes,
+				job_ptr->alloc_lps_cnt);
+			break;
+		}
+		task_cnt = job_ptr->alloc_lps[i];
+		if (job_ptr->details && job_ptr->details->cpus_per_task)
+			task_cnt /= job_ptr->details->cpus_per_task;
+		for (j=0; j<task_cnt; j++) {
+			if (buf[0])	/* buf starts as "", not NULL */
+				xstrcat(buf, ":");
+			xstrcat(buf, host);
+		}
+		free(host);
+	}
+	hostlist_destroy(hl);
+	return buf;
+}
diff --git a/src/plugins/sched/wiki2/hostlist.c b/src/plugins/sched/wiki2/hostlist.c
index c31dc6bf892..7a96bd30cc5 100644
--- a/src/plugins/sched/wiki2/hostlist.c
+++ b/src/plugins/sched/wiki2/hostlist.c
@@ -160,7 +160,7 @@ extern char * slurm_job2moab_task_list(struct job_record *job_ptr)
 /* Return task list in Moab format 1: tux0:tux0:tux1:tux1:tux2 */
 static char * _task_list(struct job_record *job_ptr)
 {
-	int i, j;
+	int i, j, task_cnt;
 	char *buf = NULL, *host;
 	hostlist_t hl = hostlist_create(job_ptr->nodes);
 
@@ -178,7 +178,10 @@ static char * _task_list(struct job_record *job_ptr)
 				job_ptr->alloc_lps_cnt);
 			break;
 		}
-		for (j=0; j<job_ptr->alloc_lps[i]; j++) {
+		task_cnt = job_ptr->alloc_lps[i];
+		if (job_ptr->details && job_ptr->details->cpus_per_task)
+			task_cnt /= job_ptr->details->cpus_per_task;
+		for (j=0; j<task_cnt; j++) {
 			if (buf)
 				xstrcat(buf, ":");
 			xstrcat(buf, host);
@@ -247,7 +250,7 @@ static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps)
 /* Return task list in Moab format 2: tux[0-1]*2:tux2 */
 static char * _task_list_exp(struct job_record *job_ptr)
 {
-	int i, reps = -1;
+	int i, reps = -1, task_cnt;
 	char *buf = NULL, *host;
 	hostlist_t hl = hostlist_create(job_ptr->nodes);
 	hostlist_t hl_tmp = (hostlist_t) NULL;
@@ -267,7 +270,10 @@ static char * _task_list_exp(struct job_record *job_ptr)
 			break;
 		}
 
-		if (reps == job_ptr->alloc_lps[i]) {
+		task_cnt = job_ptr->alloc_lps[i];
+		if (job_ptr->details && job_ptr->details->cpus_per_task)
+			task_cnt /= job_ptr->details->cpus_per_task;
+		if (reps == task_cnt) {
 			/* append to existing hostlist record */
 			if (hostlist_push(hl_tmp, host) == 0)
 				error("hostlist_push failure");
@@ -278,7 +284,7 @@ static char * _task_list_exp(struct job_record *job_ptr)
 			/* start new hostlist record */
 			hl_tmp = hostlist_create(host);
 			if (hl_tmp)
-				reps = job_ptr->alloc_lps[i];
+				reps = task_cnt;
 			else
 				error("hostlist_create failure");
 		}
diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c
index 7eecac2d9b7..9b9041e2445 100644
--- a/src/sbatch/opt.c
+++ b/src/sbatch/opt.c
@@ -145,6 +145,7 @@ static void _opt_pbs_batch_script(const void *body, int size);
 
 /* set options based upon env vars  */
 static void _opt_env(void);
+static void _proc_get_user_env(char *optarg);
 
 /* list known options and their settings  */
 static void  _opt_list(void);
@@ -278,7 +279,9 @@ static void _opt_default()
 	opt.ifname = xstrdup("/dev/null");
 	opt.ofname = NULL;
 	opt.efname = NULL;
-	opt.get_user_env = -1;
+
+	opt.get_user_env_time = -1;
+	opt.get_user_env_mode = -1;
 }
 
 /*---[ env var processing ]-----------------------------------------------*/
@@ -1213,9 +1216,9 @@ static void _set_options(int argc, char **argv)
 			break;
 		case LONG_OPT_GET_USER_ENV:
 			if (optarg)
-				opt.get_user_env = strtol(optarg, NULL, 10);
+				_proc_get_user_env(optarg);
 			else
-				opt.get_user_env = 0;
+				opt.get_user_env_time = 0;
 			break;
 		default:
 			fatal("Unrecognized command line parameter %c",
@@ -1228,6 +1231,25 @@ static void _set_options(int argc, char **argv)
 	}
 }
 
+/* Parse "[timeout][mode]": leading digits are the timeout in seconds,
+ * then 's'/'S' selects mode 1 and 'l'/'L' mode 2; other text is ignored */
+static void _proc_get_user_env(char *optarg)
+{
+	char *mode_str = optarg;
+
+	opt.get_user_env_time = 0;
+	if ((optarg[0] >= '0') && (optarg[0] <= '9'))
+		opt.get_user_env_time = strtol(optarg, &mode_str, 10);
+	switch (mode_str ? mode_str[0] : '\0') {
+	case 's': case 'S':
+		opt.get_user_env_mode = 1;
+		break;
+	case 'l': case 'L':
+		opt.get_user_env_mode = 2;
+		break;
+	}
+}
+
 static void _set_pbs_options(int argc, char **argv)
 {
 	int opt_char, option_index = 0;
diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h
index cb8097a05ad..ab0d10c0f1e 100644
--- a/src/sbatch/opt.h
+++ b/src/sbatch/opt.h
@@ -133,7 +133,8 @@ typedef struct sbatch_options {
 	char *ifname;		/* input file name		*/
 	char *ofname;		/* output file name		*/
 	char *efname;		/* error file name		*/
-	int get_user_env;	/* --get-user-env[=timeout]	*/
+	int get_user_env_time;	/* --get-user-env[=timeout]	*/
+	int get_user_env_mode;	/* --get-user-env=[S|L]         */
 } opt_t;
 
 extern opt_t opt;
diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c
index 75f0082e0de..98fe308fdcc 100644
--- a/src/sbatch/sbatch.c
+++ b/src/sbatch/sbatch.c
@@ -218,12 +218,13 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc)
 	desc->shared = opt.shared;
 
 	desc->environment = NULL;
-	if (opt.get_user_env >= 0) {
+	if (opt.get_user_env_time >= 0) {
 		struct passwd *pw = NULL;
 		pw = getpwuid(opt.uid);
 		if (pw != NULL) {
 			desc->environment = env_array_user_default(pw->pw_name,
-						opt.get_user_env);
+						opt.get_user_env_time,
+						opt.get_user_env_mode);
 			/* FIXME - should we abort if j->environment
 			 * is NULL? */
 		}
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 2185dd3b9ac..b2b4f29aadf 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -766,7 +766,7 @@ step_create(job_step_create_request_msg_t *step_specs,
 	if (job_ptr == NULL)
 		return ESLURM_INVALID_JOB_ID ;
 
-	if (job_ptr->job_state == JOB_SUSPENDED)
+	if ((job_ptr->job_state == JOB_SUSPENDED) || IS_JOB_PENDING(job_ptr))
 		return ESLURM_DISABLED;
 
 	if (batch_step) {
@@ -781,9 +781,6 @@ step_create(job_step_create_request_msg_t *step_specs,
 	    (step_specs->user_id != 0))
 		return ESLURM_ACCESS_DENIED ;
 
-	if (IS_JOB_PENDING(job_ptr))
-		return ESLURM_INVALID_JOB_ID ;
-
 	if (IS_JOB_FINISHED(job_ptr) || 
 	    (job_ptr->end_time <= time(NULL)))
 		return ESLURM_ALREADY_DONE;
-- 
GitLab