diff --git a/NEWS b/NEWS index 7c865e67bd0ce57fc469e69a58d77fa33fde117e..5a169affc16e1eacbb8e67c1ce9837f301de29b7 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. -- Some updates to man page formatting from Gennaro Oliva, ICAR. -- Smarter loading of plugins (doesn't stat every file in the plugin dir) -- In sched/backfill avoid trying to schedule jobs on DOWN or DRAINED nodes. + -- Add retry logic to socket connect() call from client which can fail + when the slurmctld is under heavy load. * Changes in SLURM 1.3.3 ======================== diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index c83899b24826d1c29dbb5b0d778122b0071a5512..3ff66163f9144d609ee72eb71b94354128d2b3da 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1173,8 +1173,9 @@ slurm_fd slurm_open_msg_conn(slurm_addr * slurm_address) return _slurm_open_msg_conn(slurm_address); } -/* calls connect to make a connection-less datagram connection to the - * primary or secondary slurmctld message engine +/* Calls connect to make a connection-less datagram connection to the + * primary or secondary slurmctld message engine. If the controller + * is very busy the connect may fail, so retry a couple of times. * OUT addr - address of controller contacted * RET slurm_fd - file descriptor of the connection created */ @@ -1182,29 +1183,39 @@ slurm_fd slurm_open_controller_conn(slurm_addr *addr) { slurm_fd fd; slurm_ctl_conf_t *conf; + int retry, have_backup = 0; if (slurm_api_set_default_config() < 0) return SLURM_FAILURE; - addr = &proto_conf->primary_controller; - if ((fd = slurm_open_msg_conn(&proto_conf->primary_controller)) >= 0) - return fd; - - debug("Failed to contact primary controller: %m"); - conf = slurm_conf_lock(); - if (!conf->backup_controller) { - slurm_conf_unlock(); - goto fail; + for (retry=0; retry<2; retry++) { + if (retry) + sleep(1); + + addr = &proto_conf->primary_controller; + fd = slurm_open_msg_conn(&proto_conf->primary_controller); + if (fd >= 0) + return fd; + debug("Failed to contact primary controller: %m"); + + if (retry == 0) { + conf = slurm_conf_lock(); + if (conf->backup_controller) + have_backup = 1; + slurm_conf_unlock(); + } + + if (have_backup) { + addr = &proto_conf->secondary_controller; + fd = slurm_open_msg_conn(&proto_conf-> + secondary_controller); + if (fd >= 0) + return fd; + debug("Failed to contact secondary controller: %m"); + } } - slurm_conf_unlock(); - addr = &proto_conf->secondary_controller; - if ((fd = slurm_open_msg_conn(&proto_conf->secondary_controller)) >= 0) - return fd; addr = NULL; - debug("Failed to contact secondary controller: %m"); - - fail: slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR); }