From 7dbf1ccda5aebea5b7c5417a0eeea7bcc6dc6f0e Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 10 Oct 2003 22:37:30 +0000 Subject: [PATCH] Make agent logic more robust in the face of communications errors. Send multiple SIGALRMs if needed and deal with possible abort of a thread. --- src/slurmctld/agent.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index bf97d61c7a7..3713eb61426 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -362,8 +362,9 @@ static void *_wdog(void *args) if (thread_ptr[i].end_time <= now) { debug3("agent thread %lu timed out\n", (unsigned long) thread_ptr[i].thread); - pthread_kill(thread_ptr[i].thread, - SIGALRM); + if (pthread_kill(thread_ptr[i].thread, + SIGALRM) == ESRCH) + thread_ptr[i].state = DSH_FAILED; } break; case DSH_NEW: @@ -612,12 +613,12 @@ static void *_thread_per_node_rpc(void *args) } /* - * SIGALRM handler. This is just a stub because we are really interested - * in interrupting connect() in k4cmd/rcmd or select() in rsh() below and - * causing them to return EINTR. + * SIGALRM handler. We are really interested in interrupting hung communications + * and causing them to return EINTR. Multiple interrupts might be required. */ static void _alarm_handler(int dummy) { + xsignal(SIGALRM, _alarm_handler); } -- GitLab