diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index f0485e490eaad192b3efb93687dea78b1ebcb57b..4624d5f799523084b9a5d0347ac6718723b5879b 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -4284,7 +4284,10 @@ static void _suspend_job(struct job_record *job_ptr, uint16_t op) agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SUSPEND; - agent_args->retry = 1; + agent_args->retry = 0; /* don't resend or gang schedulers + * (sched/gang or sched/wiki) can + * quickly induce huge backlog + * of agent.c RPCs */ agent_args->hostlist = hostlist_create(""); sus_ptr = xmalloc(sizeof(suspend_msg_t)); sus_ptr->job_id = job_ptr->job_id; diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 3c3891dc408b48d2bbe13f8a5421cf7a5caad86a..686c4808ed8c1cada2ba39dc1964a450442bf185 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -182,7 +182,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, agent_args->msg_type = REQUEST_KILL_TIMELIMIT; else agent_args->msg_type = REQUEST_TERMINATE_JOB; - agent_args->retry = 1; + agent_args->retry = 0; /* re_kill_job() resends as needed */ agent_args->hostlist = hostlist_create(""); kill_job = xmalloc(sizeof(kill_job_msg_t)); last_node_update = time(NULL);