Skip to content
Snippets Groups Projects
Commit e69cc078 authored by Moe Jette's avatar Moe Jette
Browse files

Add retry logic to the socket connect() call from the client, which can fail when the slurmctld is under heavy load.
parent 8a315acf
No related branches found
No related tags found
No related merge requests found
...@@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. ...@@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins.
-- Some updates to man page formatting from Gennaro Oliva, ICAR. -- Some updates to man page formatting from Gennaro Oliva, ICAR.
-- Smarter loading of plugins (doesn't stat every file in the plugin dir) -- Smarter loading of plugins (doesn't stat every file in the plugin dir)
-- In sched/backfill avoid trying to schedule jobs on DOWN or DRAINED nodes. -- In sched/backfill avoid trying to schedule jobs on DOWN or DRAINED nodes.
-- Add retry logic to socket connect() call from client which can fail
when the slurmctld is under heavy load.
* Changes in SLURM 1.3.3 * Changes in SLURM 1.3.3
======================== ========================
......
...@@ -1173,8 +1173,9 @@ slurm_fd slurm_open_msg_conn(slurm_addr * slurm_address) ...@@ -1173,8 +1173,9 @@ slurm_fd slurm_open_msg_conn(slurm_addr * slurm_address)
return _slurm_open_msg_conn(slurm_address); return _slurm_open_msg_conn(slurm_address);
} }
/* calls connect to make a connection-less datagram connection to the /* Calls connect to make a connection-less datagram connection to the
* primary or secondary slurmctld message engine * primary or secondary slurmctld message engine. If the controller
* is very busy the connect may fail, so retry a couple of times.
* OUT addr - address of controller contacted * OUT addr - address of controller contacted
* RET slurm_fd - file descriptor of the connection created * RET slurm_fd - file descriptor of the connection created
*/ */
...@@ -1182,29 +1183,39 @@ slurm_fd slurm_open_controller_conn(slurm_addr *addr) ...@@ -1182,29 +1183,39 @@ slurm_fd slurm_open_controller_conn(slurm_addr *addr)
{ {
slurm_fd fd; slurm_fd fd;
slurm_ctl_conf_t *conf; slurm_ctl_conf_t *conf;
int retry, have_backup = 0;
if (slurm_api_set_default_config() < 0) if (slurm_api_set_default_config() < 0)
return SLURM_FAILURE; return SLURM_FAILURE;
addr = &proto_conf->primary_controller;
if ((fd = slurm_open_msg_conn(&proto_conf->primary_controller)) >= 0)
return fd;
debug("Failed to contact primary controller: %m");
conf = slurm_conf_lock(); for (retry=0; retry<2; retry++) {
if (!conf->backup_controller) { if (retry)
slurm_conf_unlock(); sleep(1);
goto fail;
addr = &proto_conf->primary_controller;
fd = slurm_open_msg_conn(&proto_conf->primary_controller);
if (fd >= 0)
return fd;
debug("Failed to contact primary controller: %m");
if (retry == 0) {
conf = slurm_conf_lock();
if (conf->backup_controller)
have_backup = 1;
slurm_conf_unlock();
}
if (have_backup) {
addr = &proto_conf->secondary_controller;
fd = slurm_open_msg_conn(&proto_conf->
secondary_controller);
if (fd >= 0)
return fd;
debug("Failed to contact secondary controller: %m");
}
} }
slurm_conf_unlock();
addr = &proto_conf->secondary_controller;
if ((fd = slurm_open_msg_conn(&proto_conf->secondary_controller)) >= 0)
return fd;
addr = NULL; addr = NULL;
debug("Failed to contact secondary controller: %m");
fail:
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR); slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR);
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment