From 6e1e8c3e8a8ba5c0af9e885c6dc897c82079b62e Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 28 Jun 2013 08:22:32 -0700
Subject: [PATCH] Srun to keep running after receiving unrecognized message

This can happen if something outside of Slurm opens the srun socket
and writes to it, since the data will not be of a form that Slurm can
decode.
Bug 354
---
 src/srun/libsrun/allocate.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c
index 4fabd66a719..0d5fc520f6e 100644
--- a/src/srun/libsrun/allocate.c
+++ b/src/srun/libsrun/allocate.c
@@ -207,6 +207,16 @@ static bool _retry(void)
 		      slurm_strerror(ESLURM_NODES_BUSY));
 		error_exit = immediate_exit;
 		return false;
+	} else if ((errno == SLURM_PROTOCOL_AUTHENTICATION_ERROR) ||
+		   (errno == SLURM_UNEXPECTED_MSG_ERROR) ||
+		   (errno == SLURM_PROTOCOL_INSANE_MSG_LENGTH)) {
+		static int external_msg_count = 0;
+		error("Srun communication socket apparently being written to "
+		      "by something other than Slurm");
+		if (external_msg_count++ < 4)
+			return true;
+		error("Unable to allocate resources: %m");
+		return false;
 	} else {
 		error("Unable to allocate resources: %m");
 		return false;
-- 
GitLab