From fcbc74d2cc456429055ef42e96b90eb589d89249 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 3 Dec 2002 18:00:47 +0000
Subject: [PATCH] Restructure I/O timeout logic, mostly for I/O that completes
 long after the user's job.

---
 src/srun/io.c | 45 +++++++++++++++++----------------------------
 1 file changed, 17 insertions(+), 28 deletions(-)

diff --git a/src/srun/io.c b/src/srun/io.c
index 4b70a58cc77..7290ad5b0c2 100644
--- a/src/srun/io.c
+++ b/src/srun/io.c
@@ -50,8 +50,9 @@
 #include "src/srun/opt.h"
 
 #define IO_BUFSIZ		2048
-#define MAX_MSG_WAIT_SEC	  10	/* max wait to confirm launches, sec */
-#define MAX_IO_WAIT_SEC		 600	/* max I/O idle, secs, warning msg */
+#define MAX_MSG_WAIT_SEC	  30	/* max wait to confirm launches, sec */
+#define MAX_TERM_WAIT_SEC	  60	/* max time since first task 
+					 * terminated, secs, warning msg */
 #define POLL_TIMEOUT_MSEC	 500	/* max wait for i/o poll, msec */
 
 /* fd_info struct used in poll() loop to map fds back to task number,
@@ -65,7 +66,6 @@ typedef struct fd_info {
 } fd_info_t;
 
 static time_t time_first_done = 0;
-static time_t time_job_done = 0;
 static time_t time_last_io = 0;
 static time_t time_startup = 0;
 
@@ -197,25 +197,10 @@ _io_thr_poll(void *job_arg)
 				time_first_done = time(NULL);
 		}
 
-		if ((job->state == SRUN_JOB_OVERDONE) ||
-		    (job->state == SRUN_JOB_FAILED)) {
-			if (time_job_done == 0)
-				time_job_done = time(NULL);
-		}
-		if (time_job_done &&
-		    ((time(NULL) - time_job_done) > MAX_MSG_WAIT_SEC)) {
-			for (i = 0; i < opt.nprocs; i++) {
-				if ((job->out[i] == IO_DONE) && 
-				    (job->err[i] == IO_DONE))
-					continue;
-				error("Task %d on node %s terminated abnormally",
-				      i, _taskid2hostname(i, job));
-				update_job_state(job, SRUN_JOB_FAILED);
-			}
+		if (job->state == SRUN_JOB_FAILED)
 			pthread_exit(0);
-		}
 
-		while ((rc = poll(fds, nfds, POLL_TIMEOUT_MSEC)) < 0) {
+		while ((rc = poll(fds, nfds, POLL_TIMEOUT_MSEC)) <= 0) {
 			if (rc == 0) {	/* timeout */
 				_do_poll_timeout(job);
 				continue;
@@ -266,8 +251,8 @@ _io_thr_poll(void *job_arg)
 static void _do_poll_timeout (job_t *job)
 {
 	int i, j;
-	static bool no_io_msg_sent = false;
 	static bool check_all_start = false;
+	static bool term_msg_sent = false;
 
 	if ((check_all_start == false) &&
 	    ((time(NULL) - time_startup) > MAX_MSG_WAIT_SEC)) {
@@ -282,9 +267,14 @@ static void _do_poll_timeout (job_t *job)
 		}
 	}
 
+	for (i = 0; ((i < opt.nprocs) && (time_first_done == 0)); i++) {
+		if ((job->task_state[i] == SRUN_TASK_FAILED) || 
+		    (job->task_state[i] == SRUN_TASK_EXITED))
+			time_first_done = time(NULL);
+	}
+
 	i = time(NULL) - time_last_io;
 	j = time(NULL) - time_first_done;
-
 	if (job->state == SRUN_JOB_FAILED)
 		pthread_exit(0);
 	else if (time_first_done && opt.max_wait && (j > opt.max_wait)) {
@@ -293,11 +283,10 @@ static void _do_poll_timeout (job_t *job)
 		report_task_status(job);
 		update_job_state(job, SRUN_JOB_FAILED);
 		pthread_exit(0);
-	} else if (no_io_msg_sent)
-		;
-	else if (i > MAX_IO_WAIT_SEC) {
-		info("Warning: No I/O in %d seconds", MAX_IO_WAIT_SEC);
-		no_io_msg_sent = true; 
+	} else if (time_first_done && (term_msg_sent == false) && 
+		   (j > MAX_TERM_WAIT_SEC)) {
+		info("Warning: First task termination %d seconds ago", j);
+		term_msg_sent = true;
 	}
 }
 
@@ -337,7 +326,7 @@ void report_task_status(job_t *job)
 		current_state = job->task_state[i];
 		first_task = last_task = i;
 		for (j = (i+1); j < opt.nprocs; j++) {
-			if (current_state == job->task_state[i])
+			if (current_state == job->task_state[j])
 				last_task = j;
 			else
 				break;
-- 
GitLab