diff --git a/NEWS b/NEWS index 5c0c178deeaa8bfc15730c057f26c59207ab342f..bf0df8d69300995805a777bf214100fa7b58dc48 100644 --- a/NEWS +++ b/NEWS @@ -259,6 +259,10 @@ documents those changes that are of interest to users and administrators. non-standard location. -- Fix MemSpecLimit to explicitly require TaskPlugin=task/cgroup and ConstrainRAMSpace set in cgroup.conf. + -- MYSQL - Fix order of operations issue where if the database is locked up + and the slurmctld doesn't wait long enough for the response it would give + up leaving the connection open and create a situation where the next message + sent could receive the response of the first one. * Changes in Slurm 15.08.11 =========================== diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index 1a9704e0f4c17e8e99f39f241e95685d750bb536..da48ed0e65d80f8365d473a27176afab5d5846b8 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -1857,16 +1857,16 @@ static Buf _recv_msg(int read_timeout) return NULL; if (!_fd_readable(slurmdbd_fd, read_timeout)) - return NULL; + goto endit; msg_read = read(slurmdbd_fd, &nw_size, sizeof(nw_size)); if (msg_read != sizeof(nw_size)) - return NULL; + goto endit; msg_size = ntohl(nw_size); /* We don't error check for an upper limit here * since size could possibly be massive */ if (msg_size < 2) { error("slurmdbd: Invalid msg_size (%u)", msg_size); - return NULL; + goto endit; } msg = xmalloc(msg_size); @@ -1888,11 +1888,19 @@ static Buf _recv_msg(int read_timeout) offset, msg_size); } /* else in shutdown mode */ xfree(msg); - return NULL; + goto endit; } buffer = create_buf(msg, msg_size); return buffer; + +endit: + /* Close it since we abandoned it. If the connection does still exist + * on the other end we can't rely on it after this point since we didn't + * listen long enough for this response. 
+ */ + _reopen_slurmdbd_fd(); + return NULL; } /* Return time in msec since "start time" */ diff --git a/src/slurmdbd/rpc_mgr.c b/src/slurmdbd/rpc_mgr.c index f449c063c44be159c2caf92c3efa6a03b2d29c31..567bf8e79076535397485319bff741402e48e935 100644 --- a/src/slurmdbd/rpc_mgr.c +++ b/src/slurmdbd/rpc_mgr.c @@ -246,7 +246,16 @@ static void * _service_connection(void *arg) fini = true; } - (void) _send_resp(conn->newsockfd, buffer); + if (_send_resp(conn->newsockfd, buffer) != SLURM_SUCCESS) { + /* This is only an issue on persistent connections, and + * really isn't that big of a deal as the slurmctld + * will just send the message again. */ + if (conn->ctld_port) + debug("Problem sending response to " + "connection %d(%s) uid(%d)", + conn->newsockfd, conn->ip, uid); + fini = true; + } xfree(msg); }