From ccb12c684bccb0fcf6b8ff7e982faeb186c1e5fd Mon Sep 17 00:00:00 2001
From: "Christopher J. Morrone" <morrone2@llnl.gov>
Date: Fri, 4 Aug 2006 03:16:50 +0000
Subject: [PATCH] Greatly improve salloc's handling of SIGINT.

If the allocation has been granted, it will release the job
allocation on the first SIGINT, and ignore any following SIGINT signals
while waiting for the user's command to exit.

If the allocation if pending, salloc revokes the allocation and exits.

Should all other signals have the same behaviour?
---
 src/api/allocate.c  |   7 ++--
 src/salloc/salloc.c | 100 ++++++++++++++++++++++++++++++--------------
 2 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/src/api/allocate.c b/src/api/allocate.c
index 9a9ea957ef6..76898a6d673 100644
--- a/src/api/allocate.c
+++ b/src/api/allocate.c
@@ -241,7 +241,7 @@ slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
 			/* no, we need to wait for a response */
 			job_id = resp->job_id;
 			slurm_free_resource_allocation_response_msg(resp);
-			info("Pending job allocation %u", job_id);
+			verbose("Pending job allocation %u", job_id);
  			resp = _wait_for_allocation_response(job_id, listen,
 							     timeout);
 			/* If NULL, we didn't get the allocation in 
@@ -645,12 +645,13 @@ _accept_msg_connection(int listen_fd, resource_allocation_response_msg_t **resp)
 	msg->ret_list = NULL;
 	msg->conn_fd = conn_fd;
 	
-  again:
 	ret_list = slurm_receive_msg(conn_fd, msg, 0);
 
 	if (!ret_list || errno != SLURM_SUCCESS) {
 		if (errno == EINTR) {
-			goto again;
+			slurm_close_accepted_conn(conn_fd);
+			*resp = NULL;
+			return 0;
 		}
 		if (ret_list)
 			list_destroy(ret_list);
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index 132e5f5e339..ec3b39f75d3 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -51,9 +51,8 @@
 
 static int fill_job_desc_from_opts(job_desc_msg_t *desc);
 static void ring_terminal_bell(void);
-static int run_command(char **command);
-
-static int signals_to_block[] = {SIGINT, SIGTERM, SIGQUIT, 0};
+static int fork_command(char **command);
+static void _ignore_signal(int);
 
 int main(int argc, char *argv[])
 {
@@ -62,8 +61,14 @@ int main(int argc, char *argv[])
 	resource_allocation_response_msg_t *alloc;
 	time_t before, after;
 	salloc_msg_thread_t *msg_thr;
-	int rc;
 	char **env = NULL;
+	int status;
+	int errnum = 0;
+	pid_t pid;
+	pid_t rc_pid = 0;
+	int rc = 0;
+
+	xsignal(SIGINT, _ignore_signal);
 
 	log_init(xbasename(argv[0]), logopt, 0, NULL);
 	if (initialize_and_process_args(argc, argv) < 0) {
@@ -77,8 +82,6 @@ int main(int argc, char *argv[])
 		log_alter(logopt, 0, NULL);
 	}
 
-	xsignal_block(signals_to_block);
-
 	/*
 	 * Request a job allocation
 	 */
@@ -112,20 +115,64 @@ int main(int argc, char *argv[])
 	 */
 	env = env_array_create_for_job(alloc);
 	env_array_set_environment(env);
-	rc = run_command(command_argv);
 	env_array_free(env);
+	pid = fork_command(command_argv);
+
+	/*
+	 * Wait for command to exit, OR for waitpid to be interrupted by a
+	 * signal.  Either way, we are going to release the allocation next.
+	 */
+	if (pid > 0) {
+		rc_pid = waitpid(pid, &status, 0);
+		errnum = errno;
+		if (rc_pid == -1 && errnum != EINTR)
+			error("waitpid for %s failed: %m", command_argv[0]);
+	}
 
 	/*
 	 * Relinquish the job allocation.
 	 */
-	verbose("Relinquishing job allocation %d", alloc->job_id);
+	if (pid > 0 && rc_pid == -1 && errnum == EINTR) {
+		info("Relinquishing job allocation %d", alloc->job_id);
+	} else {
+		verbose("Relinquishing job allocation %d", alloc->job_id);
+	}
 	if (slurm_complete_job(alloc->job_id, 0) != 0)
-		fatal("Unable to clean up job allocation %d: %m",
+		error("Unable to clean up job allocation %d: %m",
 		      alloc->job_id);
 
 	slurm_free_resource_allocation_response_msg(alloc);
 	msg_thr_destroy(msg_thr);
 
+	/*
+	 * If the earlier waitpid was interrupted by a signal,
+	 * wait indefinitely for the process to exit.
+	 */
+	if (pid > 0 && rc_pid == -1 && errnum == EINTR) {
+		while ((rc_pid = waitpid(pid, &status, 0)) == -1
+		       && errno == EINTR) {
+			/* just keep spinning */
+			info("Job already released, waiting for command");
+		}
+		if (rc_pid == -1)
+			error("second waitpid for %s failed: %m",
+			      command_argv[0]);
+	}
+
+	/*
+	 * Figure out what return code we should use.  If the user's command
+	 * exitted normally, return the user's return code.
+	 */
+	rc = 1;
+	if (rc_pid != -1) {
+		if (WIFEXITED(status)) {
+			rc = WEXITSTATUS(status);
+		} else if (WIFSIGNALED(status)) {
+			verbose("Command \"%s\" was terminated by signal %d",
+				WTERMSIG(status));
+		}
+	}
+
 	return rc;
 }
 
@@ -221,36 +268,27 @@ static void ring_terminal_bell(void)
 	fflush(stdout);
 }
 
-/* returns the exit status of the command */
-static int run_command(char **command)
+/* returns the pid of the forked command, or <0 on error */
+static pid_t fork_command(char **command)
 {
 	pid_t pid;
-	int status;
-	int rc = 0;
 
 	pid = fork();
 	if (pid < 0) {
 		error("fork failed: %m");
-	} else if (pid > 0) {
-		/* parent */
-		while ((rc = waitpid(pid, &status, 0)) == -1
-		       && errno == EINTR) {
-			/* just keep spinning */
-		}
-		if (rc == -1) {
-			error("waitpid for %s failed: %m", command[0]);
-			rc = 1;
-		} else {
-			if (WIFEXITED(status))
-				rc = WEXITSTATUS(status);
-			else
-				rc = 1;
-		}
-	} else {
+	} else if (pid == 0) {
 		/* child */
-		xsignal_unblock(signals_to_block);
 		execvp(command[0], command);
+
+		/* should only get here if execvp failed */
+		error("Unable to exec command \"%s\"", command[0]);
+		exit(1);
 	}
+	/* parent returns */
+	return pid;
+}
 
-	return rc;
+static void _ignore_signal(int foo)
+{
+	/* do nothing */
 }
-- 
GitLab