diff --git a/NEWS b/NEWS index 6aedea181b2fa5779a226bfbf600cf79de5b900d..4a78b8a588ac7d88fab6102bc5d1894fcee6d85f 100644 --- a/NEWS +++ b/NEWS @@ -482,6 +482,8 @@ documents those changes that are of interest to users and admins. including deletion of all job steps (Chris Holmes, HP, slurm,patch). -- Recognize ISO-8859 input to srun as a script (for non-English scripts). -- switch/elan: Fix bug in propagation of ELAN_STATKEY environment variable. + -- Fix bug in slurmstepd IO code that can result in it spinning if a + certain error occurs. * Changes in SLURM 1.0.15 ========================= diff --git a/src/common/eio.c b/src/common/eio.c index aecaf72a33693c2a1a2b43f0f66ecd74b887ea7d..4a7b28b983c5736f7255bae92dd3cfe2ede7064c 100644 --- a/src/common/eio.c +++ b/src/common/eio.c @@ -277,20 +277,11 @@ _poll_dispatch(struct pollfd *pfds, unsigned int nfds, eio_obj_t *map[], List objList) { int i; - ListIterator iter; - eio_obj_t *obj; for (i = 0; i < nfds; i++) { if (pfds[i].revents > 0) _poll_handle_event(pfds[i].revents, map[i], objList); } - - iter = list_iterator_create(objList); - while ((obj = list_next(iter))) { - if (_is_writable(obj) && obj->ops->handle_write) - (*obj->ops->handle_write) (obj, objList); - } - list_iterator_destroy(iter); } static void diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c index 2aea2727a54016378bbf40237545d59416bb331a..f435327a60df70ddc4156a5d4c4793059d1ce081 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -586,16 +586,16 @@ extern int _slurm_connect (int __fd, struct sockaddr const * __addr, ufds.events = POLLIN | POLLOUT; ufds.revents = 0; poll_rc = poll(&ufds, 1, 5000); - if (poll_rc == 0) - errno = ETIMEDOUT; - else if (poll_rc == 1) { + if (poll_rc == 1) { /* poll successfully completed */ if (ufds.revents & POLLERR) debug2("connect failure"); else rc = 0; - } else - error("poll: %m"); + } else { + errno = ETIMEDOUT; + debug2("poll: %m"); + } } fcntl(__fd, F_SETFL, flags); return rc; diff --git a/src/slurmd/slurmstepd/io.c b/src/slurmd/slurmstepd/io.c index 48c1edc27540ef04dec07aa05b6b7ecb33a01cb1..57f4735fdabfd9ea3148e9edd6d8fb3a00bbcb39 100644 --- a/src/slurmd/slurmstepd/io.c +++ b/src/slurmd/slurmstepd/io.c @@ -187,6 +187,12 @@ _client_readable(eio_obj_t *obj) if (client->in_eof) { debug5(" false"); + /* We no longer want the _client_read() function to handle + errors on write now that the read side of the connection + is closed. Setting handle_read to NULL will result in + the _client_write function handling errors, and closing + down the write end of the connection. */ + obj->ops->handle_read = NULL; return false; } @@ -194,6 +200,7 @@ _client_readable(eio_obj_t *obj) debug5(" false, shutdown"); shutdown(obj->fd, SHUT_RD); client->in_eof = true; + return false; } if (client->in_msg != NULL @@ -395,22 +402,16 @@ _client_write(eio_obj_t *obj, List objs) buf = client->out_msg->data + (client->out_msg->length - client->out_remaining); again: if ((n = write(obj->fd, buf, client->out_remaining)) < 0) { - if (errno == EINTR) + if (errno == EINTR) { goto again; - if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { + } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { debug5("_client_write returned EAGAIN"); return SLURM_SUCCESS; - } - if (errno == EPIPE) { + } else { client->out_eof = true; _free_all_outgoing_msgs(client->msg_queue, client->job); return SLURM_SUCCESS; } - /* Bad place for an error message, because it might try to write - to exectly the fd that is having trouble, and hang. - Not sure why it hangs... */ - /* error("Get error on write() in _client_write: %m"); */ - return SLURM_SUCCESS; } debug5("Wrote %d bytes to socket", n); client->out_remaining -= n; @@ -1146,12 +1147,18 @@ _send_eof_msg(struct task_read_info *out) Buf packbuf; debug4("Entering _send_eof_msg"); + out->eof_msg_sent = true; if (_outgoing_buf_free(out->job)) { msg = list_dequeue(out->job->free_outgoing); } else { - debug5(" free_outgoing msg list empty, can't send eof_msg"); - return; + /* eof message must be allowed to allocate new memory + because _task_readable() will return "true" until + the eof message is enqueued. For instance, if + a poll returns POLLHUP on the incoming task pipe, + put there are no outgoing message buffers available, + the slurmstepd will start spinning. */ + msg = alloc_io_buf(); } header.type = out->type; @@ -1175,7 +1182,6 @@ _send_eof_msg(struct task_read_info *out) } list_iterator_destroy(clients); - out->eof_msg_sent = true; debug4("Leaving _send_eof_msg"); } diff --git a/src/srun/opt.c b/src/srun/opt.c index 9b359e916be37b70dc0821520062c0cdf5e974d4..54c363eec2872bb0a216fa0429c588f253e8c353 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -90,7 +90,6 @@ #define OPT_NONE 0x00 #define OPT_INT 0x01 #define OPT_STRING 0x02 -#define OPT_DEBUG 0x03 #define OPT_DISTRIB 0x04 #define OPT_NODES 0x05 #define OPT_OVERCOMMIT 0x06 @@ -1050,7 +1049,6 @@ env_vars_t env_vars[] = { {"SLURM_CORE_FORMAT", OPT_CORE, NULL, NULL }, {"SLURM_CPU_BIND", OPT_CPU_BIND, NULL, NULL }, {"SLURM_MEM_BIND", OPT_MEM_BIND, NULL, NULL }, - {"SLURM_DEBUG", OPT_DEBUG, NULL, NULL }, {"SLURM_DISTRIBUTION", OPT_DISTRIB, NULL, NULL }, {"SLURM_GEOMETRY", OPT_GEOMETRY, NULL, NULL }, {"SLURM_IMMEDIATE", OPT_INT, &opt.immediate, NULL }, @@ -1126,14 +1124,6 @@ _process_env_var(env_vars_t *e, const char *val) } break; - case OPT_DEBUG: - if (val != NULL) { - _verbose = (int) strtol(val, &end, 10); - if (!(end && *end == '\0')) - error("%s=%s invalid", e->var, val); - } - break; - case OPT_DISTRIB: if (strcmp(val, "unknown") == 0) break; /* ignore it, passed from salloc */ diff --git a/src/srun/srun.c b/src/srun/srun.c index c2c96e06c33505d65c1e94dcfa3114d47afb3ad0..40fe8458231436ede3bbf46cb1987d646120d359 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -167,7 +167,11 @@ int srun(int ac, char **av) /* reinit log with new verbosity (if changed by command line) */ if (_verbose || opt.quiet) { - logopt.stderr_level += _verbose; + /* If log level is already increased, only increment the + * level to the difference of _verbose an LOG_LEVEL_INFO + */ + if ((_verbose -= (logopt.stderr_level - LOG_LEVEL_INFO)) > 0) + logopt.stderr_level += _verbose; logopt.stderr_level -= opt.quiet; logopt.prefix_level = 1; log_alter(logopt, 0, NULL);