From e69cc0783b564fb0bf6dd656e656db75695fa6c3 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 6 Jun 2008 18:05:35 +0000 Subject: [PATCH] Add retry logic to socket connect() call from client which can fail when the slurmctld is under heavy load. --- NEWS | 2 ++ src/common/slurm_protocol_api.c | 47 ++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/NEWS b/NEWS index 7c865e67bd0..5a169affc16 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. -- Some updates to man page formatting from Gennaro Oliva, ICAR. -- Smarter loading of plugins (doesn't stat every file in the plugin dir) -- In sched/backfill avoid trying to schedule jobs on DOWN or DRAINED nodes. + -- Add retry logic to socket connect() call from client which can fail + when the slurmctld is under heavy load. * Changes in SLURM 1.3.3 ======================== diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index c83899b2482..3ff66163f91 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1173,8 +1173,9 @@ slurm_fd slurm_open_msg_conn(slurm_addr * slurm_address) return _slurm_open_msg_conn(slurm_address); } -/* calls connect to make a connection-less datagram connection to the - * primary or secondary slurmctld message engine +/* Calls connect to make a connection-less datagram connection to the + * primary or secondary slurmctld message engine. If the controller + * is very busy the connect may fail, so retry a couple of times. * OUT addr - address of controller contacted * RET slurm_fd - file descriptor of the connection created */ @@ -1182,29 +1183,39 @@ slurm_fd slurm_open_controller_conn(slurm_addr *addr) { slurm_fd fd; slurm_ctl_conf_t *conf; + int retry, have_backup = 0; if (slurm_api_set_default_config() < 0) return SLURM_FAILURE; - addr = &proto_conf->primary_controller; - if ((fd = slurm_open_msg_conn(&proto_conf->primary_controller)) >= 0) - return fd; - - debug("Failed to contact primary controller: %m"); - conf = slurm_conf_lock(); - if (!conf->backup_controller) { - slurm_conf_unlock(); - goto fail; + for (retry=0; retry<2; retry++) { + if (retry) + sleep(1); + + addr = &proto_conf->primary_controller; + fd = slurm_open_msg_conn(&proto_conf->primary_controller); + if (fd >= 0) + return fd; + debug("Failed to contact primary controller: %m"); + + if (retry == 0) { + conf = slurm_conf_lock(); + if (conf->backup_controller) + have_backup = 1; + slurm_conf_unlock(); + } + + if (have_backup) { + addr = &proto_conf->secondary_controller; + fd = slurm_open_msg_conn(&proto_conf-> + secondary_controller); + if (fd >= 0) + return fd; + debug("Failed to contact secondary controller: %m"); + } } - slurm_conf_unlock(); - addr = &proto_conf->secondary_controller; - if ((fd = slurm_open_msg_conn(&proto_conf->secondary_controller)) >= 0) - return fd; addr = NULL; - debug("Failed to contact secondary controller: %m"); - - fail: slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR); } -- GitLab