Skip to content
Snippets Groups Projects
Commit 5a3b2baf authored by Moe Jette
Browse files

Update the completion retry logic to try sending the message 5 times with
sleep(1) in between, rather than retrying once after sleep(5), for better
responsiveness.
parent f34eb780
No related branches found
No related tags found
No related merge requests found
......@@ -29,6 +29,6 @@
#define REVERSE_TREE_WIDTH 2
#define REVERSE_TREE_CHILDREN_TIMEOUT 60 /* seconds */
#define REVERSE_TREE_PARENT_RETRY 5 /* seconds */
#define REVERSE_TREE_PARENT_RETRY 5 /* count, 1 sec per attempt */
#endif /* !_REVERSE_TREE_H */
......@@ -510,6 +510,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last)
step_complete_msg_t msg;
int rc = -1;
int retcode;
int i;
msg.job_id = job->jobid;
msg.job_step_id = job->stepid;
......@@ -545,21 +546,17 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last)
debug3("Rank %d sending complete to rank %d, range %d to %d",
step_complete.rank, step_complete.parent_rank, first, last);
retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10);
if (retcode == SLURM_SUCCESS && rc == 0)
goto finished;
/* On error, pause then try sending to parent again.
* The parent slurmstepd may just not have started yet, because
* of the way that the launch message forwarding works.
*/
sleep(REVERSE_TREE_PARENT_RETRY);
debug3("Rank %d retry sending complete to rank %d, range %d to %d",
step_complete.rank, step_complete.parent_rank, first, last);
retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10);
if (retcode == SLURM_SUCCESS && rc == 0)
goto finished;
for (i = 0; i < REVERSE_TREE_PARENT_RETRY; i++) {
if (i)
sleep(1);
retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10);
if (retcode == SLURM_SUCCESS && rc == 0)
goto finished;
}
/* on error AGAIN, send to the slurmctld instead */
debug3("Rank %d sending complete to slurmctld instead, range %d to %d",
step_complete.rank, first, last);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment