diff --git a/src/slurmd/common/reverse_tree.h b/src/slurmd/common/reverse_tree.h index 2ec05a91559617c770b83f5488b9a8163973c1b8..d4ddf9af10369bb9a8f51cfa4b144a64fecf8b22 100644 --- a/src/slurmd/common/reverse_tree.h +++ b/src/slurmd/common/reverse_tree.h @@ -29,6 +29,6 @@ #define REVERSE_TREE_WIDTH 2 #define REVERSE_TREE_CHILDREN_TIMEOUT 60 /* seconds */ -#define REVERSE_TREE_PARENT_RETRY 5 /* seconds */ +#define REVERSE_TREE_PARENT_RETRY 5 /* count, 1 sec per attempt */ #endif /* !_REVERSE_TREE_H */ diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index a4448405f064d504b4d4cb3e894113a464f4f192..e77486a474a67ae38a3e250df65f269ef58a06c4 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -510,6 +510,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) step_complete_msg_t msg; int rc = -1; int retcode; + int i; msg.job_id = job->jobid; msg.job_step_id = job->stepid; @@ -545,21 +546,17 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) debug3("Rank %d sending complete to rank %d, range %d to %d", step_complete.rank, step_complete.parent_rank, first, last); - retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10); - if (retcode == SLURM_SUCCESS && rc == 0) - goto finished; - /* On error, pause then try sending to parent again. * The parent slurmstepd may just not have started yet, because * of the way that the launch message forwarding works. */ - sleep(REVERSE_TREE_PARENT_RETRY); - debug3("Rank %d retry sending complete to rank %d, range %d to %d", - step_complete.rank, step_complete.parent_rank, first, last); - retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10); - if (retcode == SLURM_SUCCESS && rc == 0) - goto finished; - + for (i = 0; i < REVERSE_TREE_PARENT_RETRY; i++) { + if (i) + sleep(1); + retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10); + if (retcode == SLURM_SUCCESS && rc == 0) + goto finished; + } /* on error AGAIN, send to the slurmctld instead */ debug3("Rank %d sending complete to slurmctld instead, range %d to %d", step_complete.rank, first, last);