From dba22ac2d0fd4e6a22b9971ec0e40819346cc5f7 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 4 Nov 2002 22:28:54 +0000
Subject: [PATCH] Don't send STDIN to non-connected tasks (not started or
 done). Enhanced error logging.

---
 src/srun/io.c  | 20 +++++++++++++++-----
 src/srun/job.c |  2 --
 src/srun/msg.c |  8 ++++----
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/srun/io.c b/src/srun/io.c
index 5e64acdc243..d58937fd251 100644
--- a/src/srun/io.c
+++ b/src/srun/io.c
@@ -217,6 +217,7 @@ _io_thr_poll(void *job_arg)
 				_do_task_output_poll(&map[i]);
 		}
 	}
+	xfree(fds);
 }
 
 static void 
@@ -233,8 +234,8 @@ static void
 	_set_iofds_nonblocking(job);
 
 	for (i = 0; i < opt.nprocs; i++) {
-		job->out[i] = -1; 
-		job->err[i] = -1;
+		job->out[i] = WAITING_FOR_IO; 
+		job->err[i] = WAITING_FOR_IO;
 	}
 
 	while (1) {
@@ -463,7 +464,7 @@ _readn(int fd, void *buf, size_t nbytes)
 static void
 _bcast_stdin(int fd, job_t *job)
 {
-	int i;
+	int i, disc=0;
 	char buf[IO_BUFSIZ];
 	size_t len;
 
@@ -476,8 +477,17 @@ _bcast_stdin(int fd, job_t *job)
 		}
 	}
 
-	for (i = 0; i < opt.nprocs; i++)
-		write(job->out[i], buf, len);
+	/* broadcast to every connected task */
+	for (i = 0; i < opt.nprocs; i++) {
+		if ((job->out[i] == WAITING_FOR_IO) || 
+		    (job->out[i] == IO_DONE)) 
+			disc++;
+		else
+			write(job->out[i], buf, len);
+	}
+	if (disc)
+		error("Stdin could not be sent to %d disconnected tasks", 
+		      disc);
 	return;
 }
 
diff --git a/src/srun/job.c b/src/srun/job.c
index f1edb8fc63a..266daa6de61 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -65,7 +65,6 @@ job_create(resource_allocation_response_msg_t *resp)
 		job->cred->signature[0] = 'a';
 
 		job->nodelist = xstrdup(opt.nodelist);
-debug("nodelist=%s",job->nodelist);
 		hl = hostlist_create(opt.nodelist);
 		srand48(getpid());
 		job->jobid = (uint32_t) (lrand48() % 65550L + 1L);
@@ -76,7 +75,6 @@ debug("nodelist=%s",job->nodelist);
 
 
 	job->nhosts = hostlist_count(hl);
-debug("nhosts=%d",job->nhosts);
 
 	job->host  = (char **) xmalloc(job->nhosts * sizeof(char *));
 	job->slurmd_addr = (slurm_addr *) xmalloc(job->nhosts * sizeof(slurm_addr));
diff --git a/src/srun/msg.c b/src/srun/msg.c
index 65df5205097..18d1f4e62b8 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -130,7 +130,7 @@ _accept_msg_connection(job_t *job, int fdnum)
 	char addrbuf[256];
 
 	if ((fd = slurm_accept_msg_conn(job->jfd[fdnum], &cli_addr)) < 0) {
-		error("slurm_accept_msg_conn: %m");
+		error("_accept_msg_connection/slurm_accept_msg_conn: %m");
 		return;
 	}
 
@@ -139,7 +139,7 @@ _accept_msg_connection(job_t *job, int fdnum)
 
 	msg = xmalloc(sizeof(*msg));
 	if (slurm_receive_msg(fd, msg) == SLURM_SOCKET_ERROR) {
-		error("slurm_receive_msg: %m");
+		error("_accept_msg_connection/slurm_receive_msg: %m");
 		return;
 	}
 
@@ -232,7 +232,7 @@ _msg_thr_one(void *arg)
 	while (1) {
 
 		if ((newfd = slurm_accept_msg_conn(fd, &cli_addr)) < 0) {
-			error("slurm_accept_msg_conn: rc=%d", errno);
+			error("_msg_thr_one/slurm_accept_msg_conn: %m");
 			break;
 		}
 
@@ -242,7 +242,7 @@ _msg_thr_one(void *arg)
 
 		msg = xmalloc(sizeof(*msg));
 		if (slurm_receive_msg(newfd, msg) == SLURM_SOCKET_ERROR) {
-			error("slurm_receive_msg: rc=%d\n", errno);
+			error("_msg_thr_one/slurm_receive_msg: %m");
 			break;
 		}
 
-- 
GitLab