diff --git a/NEWS b/NEWS index 2efec3f3179ad1eb6ae52a1a5291caeda02a20b4..d30c9c6c82ce2a596b03fbf2b90cd95a2142216c 100644 --- a/NEWS +++ b/NEWS @@ -95,6 +95,8 @@ documents those changes that are of interest to users and admins. passthrough nodes to the allocation when creating a block. - BLUEGENE - Fix deadlock issue with starting and failing jobs at the same time + - Make connect() non-blocking and poll() with timeout to avoid huge + waits under some conditions. * Changes in SLURM 1.1.17 ========================= diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 6f4345d442c6cc56fb52e1c6431b47a97eb9cf8a..3bcd7d27dff018576b3315f8ceca17a87e8dc5ce 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -132,7 +132,7 @@ static slurm_errtab_t slurm_errtab[] = { { ESLURM_PATHNAME_TOO_LONG, "Pathname of a file or directory too long" }, { ESLURM_NOT_TOP_PRIORITY, - "Immediate execution impossible, higher priority jobs pending" }, + "Immediate execution impossible, insufficient priority" }, { ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE, "Requested node configuration is not available" }, { ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE, diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c index c268ad55665d6195820b38b225973ad1417dd76a..d5586eb277e6193b245bb5915c59729646a3a60b 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -565,7 +565,7 @@ extern int _slurm_getsockname (int __fd, struct sockaddr * __addr, extern int _slurm_connect (int __fd, struct sockaddr const * __addr, socklen_t __len) { -#if 1 +#if 0 return connect ( __fd , __addr , __len ) ; #else /* From "man connect": Note that for IP sockets the timeout @@ -573,21 +573,23 @@ extern int _slurm_connect (int __fd, struct sockaddr const * __addr, * * Timeouts in excess of 3 minutes have been observed, resulting * in serious problems for slurmctld. Making the connect call - * non-blocking and polling seems to fix the problem on Linux. - * It fails on AIX. */ + * non-blocking and polling seems to fix the problem. */ int rc = -1, flags; flags = fcntl(__fd, F_GETFL); fcntl(__fd, F_SETFL, flags | O_NONBLOCK); rc = connect(__fd , __addr , __len); if ((rc == -1) && (errno == EINPROGRESS)) { + int poll_rc; struct pollfd ufds; ufds.fd = __fd; - ufds.events = POLLOUT; + ufds.events = POLLIN | POLLOUT; ufds.revents = 0; - poll(&ufds, 1, 5000); /* 5 sec max wait */ - if (ufds.revents == POLLOUT) - rc = connect(__fd , __addr , __len); + poll_rc = poll(&ufds, 1, 5000); + if (poll_rc == 0) + errno = ETIMEDOUT; + else if (poll_rc == 1) + rc = 0; } fcntl(__fd, F_SETFL, flags); return rc;