From a495c553dabb847e533418df412e408b9af84201 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 26 Oct 2006 23:16:12 +0000
Subject: [PATCH] svn merge -r9938:9941
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                                             |  2 ++
 src/common/slurm_errno.c                         |  2 +-
 .../slurm_protocol_socket_implementation.c       | 16 +++++++++-------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 2efec3f3179..d30c9c6c82c 100644
--- a/NEWS
+++ b/NEWS
@@ -95,6 +95,8 @@ documents those changes that are of interest to users and admins.
    passthrough nodes to the allocation when creating a block. 
  - BLUEGENE - Fix deadlock issue with starting and failing jobs at the same
    time
+ - Make connect() non-blocking and use poll() with a timeout to avoid very
+   long waits under some conditions.
 
 * Changes in SLURM 1.1.17
 =========================
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 6f4345d442c..3bcd7d27dff 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -132,7 +132,7 @@ static slurm_errtab_t slurm_errtab[] = {
 	{ ESLURM_PATHNAME_TOO_LONG,
 	  "Pathname of a file or directory too long"   		},
 	{ ESLURM_NOT_TOP_PRIORITY,
-	  "Immediate execution impossible, higher priority jobs pending" },
+	  "Immediate execution impossible, insufficient priority" },
 	{ ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE,
 	  "Requested node configuration is not available"	},
 	{ ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE,
diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c
index c268ad55665..d5586eb277e 100644
--- a/src/common/slurm_protocol_socket_implementation.c
+++ b/src/common/slurm_protocol_socket_implementation.c
@@ -565,7 +565,7 @@ extern int _slurm_getsockname (int __fd, struct sockaddr * __addr,
 extern int _slurm_connect (int __fd, struct sockaddr const * __addr, 
                                 socklen_t __len)
 {
-#if 1
+#if 0
 	return connect ( __fd , __addr , __len ) ;
 #else
 	/* From "man connect": Note that for IP sockets the timeout
@@ -573,21 +573,23 @@ extern int _slurm_connect (int __fd, struct sockaddr const * __addr,
 	 *
 	 * Timeouts in excess of 3 minutes have been observed, resulting
 	 * in serious problems for slurmctld. Making the connect call 
-	 * non-blocking and polling seems to fix the problem on Linux. 
-	 * It fails on AIX. */
+	 * non-blocking and polling seems to fix the problem. */
 	int rc = -1, flags;
 
 	flags = fcntl(__fd, F_GETFL);
 	fcntl(__fd, F_SETFL, flags | O_NONBLOCK);
 	rc = connect(__fd , __addr , __len);
 	if ((rc == -1) && (errno == EINPROGRESS)) {
+		int poll_rc;
 		struct pollfd ufds;
 		ufds.fd = __fd;
-		ufds.events = POLLOUT;
+		ufds.events = POLLIN | POLLOUT;
 		ufds.revents = 0;
-		poll(&ufds, 1, 5000);   /* 5 sec max wait */
-		if (ufds.revents == POLLOUT)
-			rc = connect(__fd , __addr , __len);
+		poll_rc = poll(&ufds, 1, 5000);
+		if (poll_rc == 0)
+                        errno = ETIMEDOUT;
+		else if (poll_rc == 1)
+			rc = 0;
 	}
 	fcntl(__fd, F_SETFL, flags);
 	return rc;
-- 
GitLab