From 042fe16d4280430f6f649664161f2cbb706dff4e Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Mon, 5 Aug 2002 22:01:01 +0000
Subject: [PATCH]  o more changes in srun related to signals

---
 src/srun/launch.c |  1 +
 src/srun/msg.c    |  8 ++++++--
 src/srun/srun.c   | 36 +++++++++++++++++++++++-------------
 3 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/src/srun/launch.c b/src/srun/launch.c
index 33d112f8590..b69633265f8 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -82,6 +82,7 @@ launch(void *arg)
 		msg.tasks_to_launch = job->ntask[i];
 		msg.global_task_ids = 
 			(uint32_t *) xmalloc(job->ntask[i]*sizeof(uint32_t));
+		msg.srun_node_id = (uint32_t)i;
 
 		for (j = 0; j < job->ntask[i]; j++)
 			msg.global_task_ids[j] = taskid++;
diff --git a/src/srun/msg.c b/src/srun/msg.c
index 9d489460114..2a50f88a829 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -18,14 +18,18 @@ _launch_handler(job_t *job, slurm_msg_t *resp)
 	launch_tasks_response_msg_t *msg = 
 		(launch_tasks_response_msg_t *) resp->data;
 
-	debug2("recieved launch resp from %s", msg->node_name);
+	debug2("recieved launch resp from %s nodeid=%d", msg->node_name,
+			msg->srun_node_id);
 	
 	if (msg->return_code != 0)  {
 		error("recvd return code %d from %s", msg->return_code,
 				msg->node_name);
 		return;
 	} else {
-		/* job->host_state[msg->host_id] = SRUN_HOST_REPLIED; */
+		
+		if (msg->srun_node_id > 0 && msg->srun_node_id < job->nhosts)
+			job->host_state[msg->srun_node_id] = 
+				SRUN_HOST_REPLIED;
 	}
 
 }
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 348cccdf2e9..d12424b1214 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -180,8 +180,10 @@ main(int ac, char **av)
 
 	pthread_kill(job->lid, SIGTERM);
 	pthread_kill(job->jtid, SIGTERM);
-	pthread_kill(job->ioid, SIGTERM);
 	pthread_kill(job->sigid, SIGTERM);
+	fflush(stderr);
+	fflush(stdout);
+	pthread_kill(job->ioid, SIGTERM);
 
 	exit(0);
 }
@@ -227,7 +229,7 @@ allocate_nodes(void)
 	if (rc == SLURM_FAILURE) {
 		error("Unable to allocate resources: %s", 
 				slurm_strerror(errno));
-		exit(1);
+		return NULL;
 	}
 
 	return resp;
@@ -320,26 +322,34 @@ sig_thr(void *arg)
 {
 	job_t *job = (job_t *)arg;
 	sigset_t set;
+	static time_t last_intr = 0;
 	int signo;
 	struct sigaction action;
 
 
 	while (1) {
 		sigfillset(&set);
-		pthread_sigmask(SIG_UNBLOCK, &set, NULL);
+		pthread_sigmask(SIG_BLOCK, &set, NULL);
 		sigwait(&set, &signo);
 		debug2("recvd signal %d", signo);
 		switch (signo) {
-		  case SIGINT:
-			  fwd_signal(job, SIGINT);
-			  pthread_mutex_lock(&job->state_mutex);
-			  job->state = SRUN_JOB_OVERDONE;
-			  pthread_cond_signal(&job->state_cond);
-			  pthread_mutex_unlock(&job->state_mutex);
-			  break;
 		  case SIGTERM:
 			  pthread_exit(0);
 			  break;
+		  case SIGINT:
+			  if (time(NULL) - last_intr > 1) {
+				  info("sending Ctrl-C to remote tasks");
+				  last_intr = time(NULL);
+				  fwd_signal(job, signo);
+			  } else  { /* second Ctrl-C within one second of the last */
+				    /* terminate job */
+				  info("forcing termination");
+				  pthread_mutex_lock(&job->state_mutex);
+				  job->state = SRUN_JOB_OVERDONE;
+				  pthread_cond_signal(&job->state_cond);
+				  pthread_mutex_unlock(&job->state_mutex);
+			  }
+			  break;
 		  default:
 			  fwd_signal(job, signo);
 			  break;
@@ -349,14 +359,14 @@ sig_thr(void *arg)
 	pthread_exit(0);
 }
 
-void 
+	void 
 fwd_signal(job_t *job, int signo)
 {
 	int i;
 	slurm_msg_t req;
 	slurm_msg_t resp;
 	kill_tasks_msg_t msg;
-	
+
 	debug("forward signal %d to job", signo);
 
 	req.msg_type = REQUEST_KILL_TASKS;
@@ -368,7 +378,7 @@ fwd_signal(job_t *job, int signo)
 
 	for (i = 0; i < job->nhosts; i++) {
 		slurm_set_addr_uint(&req.address, slurm_get_slurmd_port(),
-				    ntohl(job->iaddr[i]));
+				ntohl(job->iaddr[i]));
 		debug("sending kill req to %s", job->host[i]);
 		if (slurm_send_recv_node_msg(&req, &resp) < 0)
 			error("Unable to send signal to host %s", 
-- 
GitLab