From 7dbf1ccda5aebea5b7c5417a0eeea7bcc6dc6f0e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 10 Oct 2003 22:37:30 +0000
Subject: [PATCH] Make agent logic more robust in the face of communications
 errors. Send multiple SIGALRMs if needed and deal with possible abort of a
 thread.

---
 src/slurmctld/agent.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index bf97d61c7a7..3713eb61426 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -362,8 +362,9 @@ static void *_wdog(void *args)
 				if (thread_ptr[i].end_time <= now) {
 					debug3("agent thread %lu timed out\n", 
 					       (unsigned long) thread_ptr[i].thread);
-					pthread_kill(thread_ptr[i].thread,
-						     SIGALRM);
+					if (pthread_kill(thread_ptr[i].thread,
+						     SIGALRM) == ESRCH)
+						thread_ptr[i].state = DSH_FAILED;
 				}
 				break;
 			case DSH_NEW:
@@ -612,12 +613,12 @@ static void *_thread_per_node_rpc(void *args)
 }
 
 /*
- * SIGALRM handler.  This is just a stub because we are really interested
- * in interrupting connect() in k4cmd/rcmd or select() in rsh() below and
- * causing them to return EINTR.
+ * SIGALRM handler.  We are really interested in interrupting hung communications
+ * and causing them to return EINTR. Multiple interrupts might be required.
  */
 static void _alarm_handler(int dummy)
 {
+	xsignal(SIGALRM, _alarm_handler);	/* NOTE(review): presumably re-installs this handler so repeated SIGALRMs are caught; confirm xsignal() semantics */
 }
 
 
-- 
GitLab