diff --git a/NEWS b/NEWS index dbc8ca12bc256f0915571ece6158bd641df49aff..631b475450cca5e9709e8e19be4357a6884d65da 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,8 @@ documents those changes that are of interest to users and admins. ============================= -- BlueGene srun --geometry was not getting propogated properly. -- Fix race condition with multiple simultaneous epilogs. + -- Modify slurmd to resend job completion RPC to slurmctld in the + case where slurmctld is not responding. * Changes in SLURM 0.5.0-pre7 ============================= diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index f1d840fce73102d1f11f7bbeb056add326e12e9f..c63050da771663befc86c53f840d265d8c746009 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -93,6 +93,9 @@ static int exit_errno[] = #define MAX_SMGR_EXIT_STATUS 6 +#define RETRY_DELAY 15 /* retry every 15 seconds */ +#define MAX_RETRY 240 /* retry 240 times (one hour max) */ + /* * List of signals to block in this process */ @@ -1131,7 +1134,7 @@ _send_launch_resp(slurmd_job_t *job, int rc) static int _complete_job(uint32_t jobid, uint32_t stepid, int err, int status) { - int rc; + int rc, i; slurm_msg_t req_msg; complete_job_step_msg_t req; @@ -1143,12 +1146,22 @@ _complete_job(uint32_t jobid, uint32_t stepid, int err, int status) req_msg.msg_type= REQUEST_COMPLETE_JOB_STEP; req_msg.data = &req; - if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0) { + /* Note: these log messages don't go to slurmd.log from here */ + for (i=0; i<=MAX_RETRY; i++) { + if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) >= 0) + break; + info("Retrying job complete RPC for %u.%u", jobid, stepid); + sleep(RETRY_DELAY); + } + if (i > MAX_RETRY) { error("Unable to send job complete message: %m"); return SLURM_ERROR; } - if (rc) slurm_seterrno_ret(rc); + if ((rc == ESLURM_ALREADY_DONE) || (rc == ESLURM_INVALID_JOB_ID)) + rc = SLURM_SUCCESS; + if (rc) + slurm_seterrno_ret(rc); return SLURM_SUCCESS; }