From 6dffdb9a60ecf366a9454ff46195c52926055b8f Mon Sep 17 00:00:00 2001
From: "Christopher J. Morrone" <morrone2@llnl.gov>
Date: Thu, 20 Jul 2006 03:17:02 +0000
Subject: [PATCH] Changes to salloc:   - returns the return code of its child
 command   - add timeout for limiting how long it waits for a PENDING job
 allocation     (slurm_allocate_resources_blocking was updated to timeout
 correctly)   - cleans up job allocation on Ctrl-C (Actually, it ignores
 SIGINT, SIGTERM,     SIGQUIT, but unblocks them for the child, so ctrl-c from
 the shell     hits the whole process group, and the child exits, and salloc
 cleans     up like normal.  Might to be more proactive with signals, but I   
  am not yet certain.)   - trim more unnecessary command line options   - man
 page additions

---
 doc/man/man1/salloc.1 | 13 +++++++----
 slurm/slurm.h.in      |  3 ++-
 src/api/allocate.c    | 36 ++++++++++++++++++-----------
 src/salloc/opt.c      | 53 ++++---------------------------------------
 src/salloc/opt.h      |  3 ---
 src/salloc/salloc.c   | 43 ++++++++++++++++++++++++-----------
 6 files changed, 68 insertions(+), 83 deletions(-)

diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index c048d2fd8da..be61083b0da 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -14,15 +14,20 @@ The command may be any program the user wishes.  Some typical commands are xterm
 .SH "OPTIONS"
 .LP 
 .TP 
-\fB\-\-jobid\fR <\fIJOBID\fP>
+\fB\-\-jobid\fR[=]<\fIJOBID\fP>
 The job allocation under which the parallel application should be launched.
 .TP 
-\fB\-n\fR, \fB\-\-ntasks\fR=<\fInumber\fR>
+\fB\-n\fR, \fB\-\-ntasks\fR[=]<\fInumber\fR>
 Specify the number of processes to launch.  The default is one process per node, but note that the \-c parameter will change this default.
 .TP 
-\fB\-N\fR, \fB\-\-nodes\fR=<\fInumber|[min]\-[max]\fR>
+\fB\-N\fR, \fB\-\-nodes\fR[=]<\fInumber|[min]\-[max]\fR>
 Specify the number of nodes to be used by this job step.  This option accepts either a single number, or a range of possible node counts.  If a single number is used, such as "\-N 4", then the allocation is asking for four and ONLY four nodes.  If a range is specified, such as "\-N 2\-6", SLURM controller may grant salloc anywhere from 2 to 6 nodes.  When using a range, either of the min or max options may be omitted.  For instance, "\-N 10\-" means "no fewer than 10 nodes", and "\-N \-20" means "no more than 20 nodes".  The default value of this option is one node, but other options may require more than one node to be allocated.
-
+.TP 
+\fB\-I\fR,\fB\-\-immediate\fR
+Do not wait for the resources need to grant this allocation.  Normally salloc will wait for the resources necessary to satisfy the requested job allocation, but when \-\-immediate is specified it will exit immediately without running the \fIcommand\fR parameter.
+.TP 
+\fB\-W\fR, \fB\-\-wait\fR[=]<\fIseconds\fR>
+If the resources needed to satisy a job allocation are not immediately available, the job allocation is enqueued and is said to be PENDING.  This option tells salloc how long (in seconds) to wait for the allocation to be granted before giving up.  When the wait limit has been reached, salloc will exit without running the \fIcommand\fR parameter.  By default, salloc will wait indefinitely.  (The \-\-immediate option makes \-\-wait moot.)
 .TP 
 \fB\-c\fR, \fB\-\-cpus\-per\-task\fR=<\fIncpus\fR>
 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 8249b089d49..87f79562c89 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -761,7 +761,8 @@ extern int slurm_allocate_resources PARAMS((
  *	A timeout of zero will wait indefinitely.
  * 
  * RET allocation structure on success, NULL on error set errno to
- *	indicate the error
+ *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
+ *      with no allocation granted)
  * NOTE: free the allocation structure using
  *	slurm_free_resource_allocation_response_msg
  */
diff --git a/src/api/allocate.c b/src/api/allocate.c
index b59051a09d7..d602c52de8d 100644
--- a/src/api/allocate.c
+++ b/src/api/allocate.c
@@ -146,12 +146,14 @@ slurm_allocate_resources (job_desc_msg_t *req,
  *	A timeout of zero will wait indefinitely.
  * 
  * RET allocation structure on success, NULL on error set errno to
- *	indicate the error
+ *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
+ *      with no allocation granted)
  * NOTE: free the allocation structure using
  *	slurm_free_resource_allocation_response_msg
  */
 resource_allocation_response_msg_t *
-slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, time_t timeout)
+slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
+				   time_t timeout)
 {
 	int rc;
 	slurm_msg_t req_msg;
@@ -236,14 +238,14 @@ slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, time_t timeou
 			/* no, we need to wait for a response */
 			job_id = resp->job_id;
 			slurm_free_resource_allocation_response_msg(resp);
-			verbose("Pending job allocation %u", job_id);
+			info("Pending job allocation %u", job_id);
  			resp = _wait_for_allocation_response(job_id, listen,
 							     timeout);
 			/* If NULL, we didn't get the allocation in 
 			   the time desired, so just free the job id */
 			if (resp == NULL) {
+				errnum = errno;
 				slurm_complete_job(job_id, -1);
-				errnum = -1;
 			}
 		}
 		break;
@@ -687,11 +689,12 @@ _wait_for_alloc_rpc(const listen_t *listen, int sleep_time,
 		    resource_allocation_response_msg_t **resp)
 {
 	struct pollfd fds[1];
+	int rc;
 
 	fds[0].fd = listen->fd;
 	fds[0].events = POLLIN;
 
-	while (poll (fds, 1, (sleep_time * 1000)) < 0) {
+	while ((rc = poll(fds, 1, (sleep_time * 1000))) < 0) {
 		switch (errno) {
 			case EAGAIN:
 			case EINTR:
@@ -708,20 +711,25 @@ _wait_for_alloc_rpc(const listen_t *listen, int sleep_time,
 		}
 	}
 
-	if (fds[0].revents & POLLIN)
+	if (rc == 0) { /* poll timed out */
+		errno = ETIMEDOUT;
+	} else if (fds[0].revents & POLLIN) {
 		return (_accept_msg_connection(listen->fd, resp));
+	}
 
-	return (0);
+	return 0;
 }
 
 static resource_allocation_response_msg_t *
 _wait_for_allocation_response(uint32_t job_id, const listen_t *listen,
 			      int timeout)
 {
-	resource_allocation_response_msg_t *resp;
+	resource_allocation_response_msg_t *resp = NULL;
+	int errnum;
 
-	debug ("job %u queued and waiting for resources", job_id);
+	debug("job %u queued and waiting for resources", job_id);
 	if (_wait_for_alloc_rpc(listen, timeout, &resp) <= 0) {
+		errnum = errno;
 		/* Maybe the resource allocation response RPC got lost
 		 * in the mail; surely it should have arrived by now.
 		 * Let's see if the controller thinks that the allocation
@@ -730,10 +738,12 @@ _wait_for_allocation_response(uint32_t job_id, const listen_t *listen,
 		if (slurm_allocation_lookup(job_id, &resp) >= 0)
 			return resp;
 
-		if (slurm_get_errno() == ESLURM_JOB_PENDING) 
-			debug3 ("Still waiting for allocation");
-		else {
-			debug3 ("Unable to confirm allocation for job %u: %m", 
+		if (slurm_get_errno() == ESLURM_JOB_PENDING) {
+			debug3("Still waiting for allocation");
+			errno = errnum;
+			return NULL;
+		} else {
+			debug3("Unable to confirm allocation for job %u: %m", 
 			       job_id);
 			return NULL;
 		}
diff --git a/src/salloc/opt.c b/src/salloc/opt.c
index acfda5da907..a5c4eff1155 100644
--- a/src/salloc/opt.c
+++ b/src/salloc/opt.c
@@ -84,7 +84,6 @@
 #define OPT_CONN_TYPE	0x08
 #define OPT_NO_ROTATE	0x0a
 #define OPT_GEOMETRY	0x0b
-#define OPT_MPI         0x0c
 #define OPT_CPU_BIND    0x0d
 #define OPT_MEM_BIND    0x0e
 
@@ -100,7 +99,6 @@
 #define LONG_OPT_CONT        0x109
 #define LONG_OPT_UID         0x10a
 #define LONG_OPT_GID         0x10b
-#define LONG_OPT_MPI         0x10c
 #define LONG_OPT_CORE	     0x10e
 #define LONG_OPT_NOSHELL     0x10f
 #define LONG_OPT_DEBUG_TS    0x110
@@ -742,10 +740,9 @@ static void _opt_default()
 	opt.no_requeue	= false;
 
 	opt.noshell	= false;
-	opt.max_wait	= slurm_get_wait_time();
+	opt.max_wait	= 0;
 
 	opt.quit_on_intr = false;
-	opt.disable_status = false;
 	opt.test_only   = false;
 
 	opt.quiet = 0;
@@ -815,8 +812,6 @@ env_vars_t env_vars[] = {
   {"SLURM_PARTITION",     OPT_STRING,     &opt.partition,     NULL           },
   {"SLURM_TIMELIMIT",     OPT_INT,        &opt.time_limit,    NULL           },
   {"SLURM_WAIT",          OPT_INT,        &opt.max_wait,      NULL           },
-  {"SLURM_DISABLE_STATUS",OPT_INT,        &opt.disable_status,NULL           },
-  {"SLURM_MPI_TYPE",      OPT_MPI,        NULL,               NULL           },
   {NULL, 0, NULL, NULL}
 };
 
@@ -919,15 +914,6 @@ _process_env_var(env_vars_t *e, const char *val)
 			      e->var, val);
 		}
 		break;
-
-	case OPT_MPI:
-		if (srun_mpi_init((char *)val) == SLURM_ERROR) {
-			fatal("\"%s=%s\" -- invalid MPI type, "
-			      "--mpi=list for acceptable types.",
-			      e->var, val);
-		}
-		break;
-
 	default:
 		/* do nothing */
 		break;
@@ -962,7 +948,6 @@ void set_options(const int argc, char **argv, int first)
 {
 	int opt_char, option_index = 0;
 	static bool set_name=false;
-	struct utsname name;
 	static struct option long_options[] = {
 		{"cpus-per-task", required_argument, 0, 'c'},
 		{"constraint",    required_argument, 0, 'C'},
@@ -988,15 +973,12 @@ void set_options(const int argc, char **argv, int first)
 		{"nodelist",      required_argument, 0, 'w'},
 		{"wait",          required_argument, 0, 'W'},
 		{"exclude",       required_argument, 0, 'x'},
-		{"disable-status", no_argument,      0, 'X'},
-		{"no-allocate",   no_argument,       0, 'Z'},
 		{"contiguous",       no_argument,       0, LONG_OPT_CONT},
 		{"exclusive",        no_argument,       0, LONG_OPT_EXCLUSIVE},
 		{"cpu_bind",         required_argument, 0, LONG_OPT_CPU_BIND},
 		{"mem_bind",         required_argument, 0, LONG_OPT_MEM_BIND},
 		{"mincpus",          required_argument, 0, LONG_OPT_MINCPU},
 		{"mem",              required_argument, 0, LONG_OPT_MEM},
-		{"mpi",              required_argument, 0, LONG_OPT_MPI},
 		{"no-shell",         no_argument,       0, LONG_OPT_NOSHELL},
 		{"tmp",              required_argument, 0, LONG_OPT_TMP},
 		{"msg-timeout",      required_argument, 0, LONG_OPT_TIMEO},
@@ -1021,7 +1003,7 @@ void set_options(const int argc, char **argv, int first)
 		{NULL,               0,                 0, 0}
 	};
 	char *opt_string = "+a:c:C:g:HIJ:km:N:"
-		"Op:P:qQR:st:U:vVw:W:x:XZ";
+		"Op:P:qQR:st:U:vVw:W:x:";
 
 	if(opt.progname == NULL)
 		opt.progname = xbasename(argv[0]);
@@ -1185,15 +1167,6 @@ void set_options(const int argc, char **argv, int first)
 			if (!_valid_node_list(&opt.exc_nodes))
 				exit(1);
 			break;
-		case (int)'X': 
-			opt.disable_status = true;
-			break;
-		case (int)'Z':
-			opt.no_alloc = true;
-			uname(&name);
-			if (strcasecmp(name.sysname, "AIX") == 0)
-				opt.network = xstrdup("ip");
-			break;
 		case LONG_OPT_CONT:
 			opt.contiguous = true;
 			break;
@@ -1221,13 +1194,6 @@ void set_options(const int argc, char **argv, int first)
 				exit(1);
 			}
 			break;
-		case LONG_OPT_MPI:
-			if (srun_mpi_init((char *)optarg) == SLURM_ERROR) {
-				fatal("\"--mpi=%s\" -- long invalid MPI type, "
-				      "--mpi=list for acceptable types.",
-				      optarg);
-			}
-			break;
 		case LONG_OPT_NOSHELL:
 			opt.noshell = true;
 			break;
@@ -1381,16 +1347,6 @@ static bool _opt_verify(void)
 		verified = false;
 	}
 
-	if (opt.no_alloc && !opt.nodelist) {
-		error("must specify a node list with -Z, --no-allocate.");
-		verified = false;
-	}
-
-	if (opt.no_alloc && opt.exc_nodes) {
-		error("can not specify --exclude list with -Z, --no-allocate.");
-		verified = false;
-	}
-
 	if (opt.mincpus < opt.cpus_per_task)
 		opt.mincpus = opt.cpus_per_task;
 
@@ -1685,10 +1641,9 @@ static void _help(void)
 "                              (type = block|cyclic|hostfile)\n"
 "  -J, --job-name=jobname      name of job\n"
 "      --mpi=type              type of MPI being used\n"
-"  -W, --wait=sec              seconds to wait after first task exits\n"
-"                              before killing job\n"
+"  -W, --wait=sec              seconds to wait for allocation if not\n"
+"                              immediately available\n"
 "  -q, --quit-on-interrupt     quit on single Ctrl-C\n"
-"  -X, --disable-status        Disable Ctrl-C status feature\n"
 "  -v, --verbose               verbose mode (multiple -v's increase verbosity)\n"
 "  -Q, --quiet                 quiet mode (suppress informational messages)\n"
 "  -d, --slurmd-debug=level    slurmd debug level\n"
diff --git a/src/salloc/opt.h b/src/salloc/opt.h
index c81e3ea98d4..77659199308 100644
--- a/src/salloc/opt.h
+++ b/src/salloc/opt.h
@@ -81,7 +81,6 @@ typedef struct salloc_options {
 	enum task_dist_states
 		distribution;	/* --distribution=, -m dist	*/
 	char *job_name;		/* --job-name=,     -J name	*/
-	char *mpi_type;		/* --mpi=type			*/
 	unsigned int dependency;/* --dependency, -P jobid	*/
 	int nice;		/* --nice			*/
 	char *account;		/* --account, -U acct_name	*/
@@ -96,7 +95,6 @@ typedef struct salloc_options {
 	bool share;		/* --share,   -s		*/
 	int  max_wait;		/* --wait,    -W		*/
 	bool quit_on_intr;      /* --quit-on-interrupt, -q      */
-	bool disable_status;    /* --disable-status, -X         */
 	int  quiet;
 	bool test_only;		/* --test-only			*/
 	char *propagate;	/* --propagate[=RLIMIT_CORE,...]*/
@@ -109,7 +107,6 @@ typedef struct salloc_options {
 	bool contiguous;	/* --contiguous			*/
 	char *nodelist;		/* --nodelist=node1,node2,...	*/
 	char *exc_nodes;	/* --exclude=node1,node2,... -x	*/
-	bool no_alloc;		/* --no-allocate, -Z		*/
 	int  max_launch_time;   /* Undocumented                 */
 	int  max_exit_timeout;  /* Undocumented                 */
 	int  msg_timeout;       /* Undocumented                 */
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index 749cb70d51b..bd60e94a927 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -42,6 +42,7 @@
 
 #include "src/common/xstring.h"
 #include "src/common/xmalloc.h"
+#include "src/common/xsignal.h"
 #include "src/common/read_config.h"
 
 #include "src/salloc/opt.h"
@@ -49,7 +50,9 @@
 
 static int fill_job_desc_from_opts(job_desc_msg_t *desc);
 static void ring_terminal_bell(void);
-static void run_command(void);
+static int run_command(char **command);
+
+static int signals_to_block[] = {SIGINT, SIGTERM, SIGQUIT, 0};
 
 int main(int argc, char *argv[])
 {
@@ -58,6 +61,7 @@ int main(int argc, char *argv[])
 	resource_allocation_response_msg_t *alloc;
 	time_t before, after;
 	salloc_msg_thread_t *msg_thr;
+	int rc;
 
 	log_init(xbasename(argv[0]), logopt, 0, NULL);
 	if (initialize_and_process_args(argc, argv) < 0) {
@@ -71,6 +75,8 @@ int main(int argc, char *argv[])
 		log_alter(logopt, 0, NULL);
 	}
 
+	xsignal_block(signals_to_block);
+
 	/*
 	 * Request a job allocation
 	 */
@@ -86,7 +92,7 @@ int main(int argc, char *argv[])
 	verbose("other_hostname = %s", desc.other_hostname);
 
 	before = time(NULL);
-	alloc = slurm_allocate_resources_blocking(&desc, 0);
+	alloc = slurm_allocate_resources_blocking(&desc, opt.max_wait);
 	if (alloc == NULL) 
 		fatal("Failed to allocate resources: %m");
 	after = time(NULL);
@@ -106,7 +112,7 @@ int main(int argc, char *argv[])
 	 */
 	setenvfs("SLURM_JOBID=%d", alloc->job_id);
 	setenvfs("SLURM_NNODES=%d", alloc->node_cnt);
-	run_command();
+	rc = run_command(command_argv);
 
 	/*
 	 * Relinquish the job allocation.
@@ -119,7 +125,7 @@ int main(int argc, char *argv[])
 	slurm_free_resource_allocation_response_msg(alloc);
 	msg_thr_destroy(msg_thr);
 
-	return 0;
+	return rc;
 }
 
 
@@ -128,7 +134,7 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc)
 {
 	desc->contiguous = opt.contiguous ? 1 : 0;
 	desc->features = opt.constraints;
-	desc->immediate = opt.immediate;
+	desc->immediate = opt.immediate ? 1 : 0;
 	desc->name = opt.job_name;
 	desc->req_nodes = opt.nodelist;
 	if (desc->req_nodes == NULL) {
@@ -214,25 +220,36 @@ static void ring_terminal_bell(void)
 	fflush(stdout);
 }
 
-static void run_command(void)
+/* returns the exit status of the command */
+static int run_command(char **command)
 {
 	pid_t pid;
 	int status;
-	int rc;
+	int rc = 0;
 
 	pid = fork();
 	if (pid < 0) {
 		error("fork failed: %m");
 	} else if (pid > 0) {
 		/* parent */
-		while ((rc = waitpid(pid, &status, 0)) == -1) {
-			if (errno != EINTR) {
-				error("waitpid failed: %m");
-				break;
-			}
+		while ((rc = waitpid(pid, &status, 0)) == -1
+		       && errno == EINTR) {
+			/* just keep spinning */
+		}
+		if (rc == -1) {
+			error("waitpid for %s failed: %m", command[0]);
+			rc = 1;
+		} else {
+			if (WIFEXITED(status))
+				rc = WEXITSTATUS(status);
+			else
+				rc = 1;
 		}
 	} else {
 		/* child */
-		execvp(command_argv[0], command_argv);
+		xsignal_unblock(signals_to_block);
+		execvp(command[0], command);
 	}
+
+	return rc;
 }
-- 
GitLab