From a495c553dabb847e533418df412e408b9af84201 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 26 Oct 2006 23:16:12 +0000 Subject: [PATCH] svn merge -r9938:9941 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1 --- NEWS | 2 ++ src/common/slurm_errno.c | 2 +- .../slurm_protocol_socket_implementation.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 2efec3f3179..d30c9c6c82c 100644 --- a/NEWS +++ b/NEWS @@ -95,6 +95,8 @@ documents those changes that are of interest to users and admins. passthrough nodes to the allocation when creating a block. - BLUEGENE - Fix deadlock issue with starting and failing jobs at the same time + - Make connect() non-blocking and poll() with timeout to avoid huge + waits under some conditions. * Changes in SLURM 1.1.17 ========================= diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 6f4345d442c..3bcd7d27dff 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -132,7 +132,7 @@ static slurm_errtab_t slurm_errtab[] = { { ESLURM_PATHNAME_TOO_LONG, "Pathname of a file or directory too long" }, { ESLURM_NOT_TOP_PRIORITY, - "Immediate execution impossible, higher priority jobs pending" }, + "Immediate execution impossible, insufficient priority" }, { ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE, "Requested node configuration is not available" }, { ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE, diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c index c268ad55665..d5586eb277e 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -565,7 +565,7 @@ extern int _slurm_getsockname (int __fd, struct sockaddr * __addr, extern int _slurm_connect (int __fd, struct sockaddr const * __addr, socklen_t __len) { -#if 1 +#if 0 return connect ( __fd , __addr , __len ) ; #else /* From "man connect": Note that for IP sockets the timeout @@ -573,21 +573,23 @@ extern int _slurm_connect (int __fd, struct sockaddr const * __addr, * * Timeouts in excess of 3 minutes have been observed, resulting * in serious problems for slurmctld. Making the connect call - * non-blocking and polling seems to fix the problem on Linux. - * It fails on AIX. */ + * non-blocking and polling seems to fix the problem. */ int rc = -1, flags; flags = fcntl(__fd, F_GETFL); fcntl(__fd, F_SETFL, flags | O_NONBLOCK); rc = connect(__fd , __addr , __len); if ((rc == -1) && (errno == EINPROGRESS)) { + int poll_rc; struct pollfd ufds; ufds.fd = __fd; - ufds.events = POLLOUT; + ufds.events = POLLIN | POLLOUT; ufds.revents = 0; - poll(&ufds, 1, 5000); /* 5 sec max wait */ - if (ufds.revents == POLLOUT) - rc = connect(__fd , __addr , __len); + poll_rc = poll(&ufds, 1, 5000); + if (poll_rc == 0) + errno = ETIMEDOUT; + else if (poll_rc == 1) + rc = 0; } fcntl(__fd, F_SETFL, flags); return rc; -- GitLab