From c06d45ead984e06ee6d1353713658f3bdd1fc96d Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 8 Jul 2003 22:43:35 +0000 Subject: [PATCH] Added new error codes for communication failures for message traffic specifically to slurm controller. --- slurm/slurm_errno.h | 6 ++ src/api/allocate.c | 6 +- src/api/reconfigure.c | 8 +-- src/common/slurm_errno.c | 9 +++ src/common/slurm_protocol_api.c | 61 +++++++++++++++---- .../slurm_protocol_socket_implementation.c | 50 ++++++++------- 6 files changed, 100 insertions(+), 40 deletions(-) diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index ee5e6eb3742..dfbfce5af98 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -84,6 +84,12 @@ enum { SLURM_PROTOCOL_AUTHENTICATION_ERROR, SLURM_PROTOCOL_INSANE_MSG_LENGTH, + /* communication failures to/from slurmctld */ + SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR = 1800, + SLURMCTLD_COMMUNICATIONS_SEND_ERROR, + SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR, + SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR, + /* _info.c/communcation layer RESPONSE_SLURM_RC message codes */ SLURM_NO_CHANGE_IN_DATA = 1900, diff --git a/src/api/allocate.c b/src/api/allocate.c index 24107cc0bd1..0895109f3f3 100644 --- a/src/api/allocate.c +++ b/src/api/allocate.c @@ -87,7 +87,7 @@ slurm_allocate_resources (job_desc_msg_t *req, req->alloc_node = NULL; if (rc == SLURM_SOCKET_ERROR) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR); + return SLURM_SOCKET_ERROR; switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: @@ -125,7 +125,7 @@ int slurm_job_will_run (job_desc_msg_t *req, req_msg.data = req; if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR); + return SLURM_SOCKET_ERROR; switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: @@ -181,7 +181,7 @@ slurm_allocate_resources_and_run (job_desc_msg_t *req, req->alloc_node = NULL; if (rc == SLURM_SOCKET_ERROR) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR); + return 
SLURM_SOCKET_ERROR; switch (resp_msg.msg_type) { diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c index ca88fbefc00..37f027563cf 100644 --- a/src/api/reconfigure.c +++ b/src/api/reconfigure.c @@ -121,16 +121,16 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req) slurm_msg_t resp_msg ; if ((fd = slurm_open_controller_conn_spec(dest)) < 0) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_CONNECTION_ERROR); + slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR); if (slurm_send_node_msg(fd, req) < 0) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR); + slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SEND_ERROR); if ((rc = slurm_receive_msg(fd, &resp_msg, 0)) < 0) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_RECEIVE_ERROR); + slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR); if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS) - slurm_seterrno_ret(SLURM_COMMUNICATIONS_SHUTDOWN_ERROR); + slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR); if (resp_msg.msg_type != RESPONSE_SLURM_RC) slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index f280e5f2d8c..31dcabf501c 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -72,6 +72,15 @@ static slurm_errtab_t slurm_errtab[] = { { SLURM_PROTOCOL_INSANE_MSG_LENGTH, "Insane message length" }, + /* communication failures to/from slurmctld */ + { SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR, + "Unable to contact slurm controller (connect failure)" }, + { SLURMCTLD_COMMUNICATIONS_SEND_ERROR, + "Unable to contact slurm controller (send failure)" }, + { SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR, + "Unable to contact slurm controller (receive failure)" }, + { SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR, + "Unable to contact slurm controller (shutdown failure)"}, /* _info.c/communcation layer RESPONSE_SLURM_RC message codes */ diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 
26c82cbecc6..16d4751ed48 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -65,6 +65,9 @@ static slurm_protocol_config_t proto_conf_default; static slurm_protocol_config_t *proto_conf = &proto_conf_default; static slurm_ctl_conf_t slurmctld_conf; +/* STATIC FUNCTIONS */ +static void _remap_slurmctld_errno(void); + /**********************************************************************\ * protocol configuration functions \**********************************************************************/ @@ -170,6 +173,21 @@ uint16_t slurm_get_wait_time(void) return slurmctld_conf.wait_time; } +/* Change general slurm communication errors to slurmctld specific errors */ +static void _remap_slurmctld_errno(void) +{ + int err = slurm_get_errno(); + + if (err == SLURM_COMMUNICATIONS_CONNECTION_ERROR) + slurm_seterrno(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR); + else if (err == SLURM_COMMUNICATIONS_SEND_ERROR) + slurm_seterrno(SLURMCTLD_COMMUNICATIONS_SEND_ERROR); + else if (err == SLURM_COMMUNICATIONS_RECEIVE_ERROR) + slurm_seterrno(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR); + else if (err == SLURM_COMMUNICATIONS_SHUTDOWN_ERROR) + slurm_seterrno(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR); +} + /**********************************************************************\ * general message management functions used by slurmctld, slurmd \**********************************************************************/ @@ -208,7 +226,10 @@ slurm_fd slurm_init_msg_engine(slurm_addr *addr) */ int slurm_shutdown_msg_engine(slurm_fd fd) { - return _slurm_close(fd); + int rc = _slurm_close(fd); + if (rc) + slurm_seterrno(SLURM_COMMUNICATIONS_SHUTDOWN_ERROR); + return rc; } /* @@ -265,7 +286,7 @@ slurm_fd slurm_open_controller_conn() debug("Failed to contact secondary controller: %m"); fail: - slurm_seterrno_ret(SLURM_COMMUNICATIONS_CONNECTION_ERROR); + slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR); } /* calls connect to make a connection-less datagram 
connection to the @@ -276,6 +297,7 @@ slurm_fd slurm_open_controller_conn() slurm_fd slurm_open_controller_conn_spec(enum controller_id dest) { slurm_addr *addr; + slurm_fd rc; if (slurm_api_set_default_config() < 0) { debug3("Error: Unable to set default config"); @@ -288,7 +310,10 @@ slurm_fd slurm_open_controller_conn_spec(enum controller_id dest) if (!addr) return SLURM_ERROR; - return slurm_open_msg_conn(addr); + rc = slurm_open_msg_conn(addr); + if (rc == -1) + _remap_slurmctld_errno(); + return rc; } /* In the bsd implmentation maps directly to a accept call @@ -781,10 +806,12 @@ _send_and_recv_msg(slurm_fd fd, slurm_msg_t *req, slurm_msg_t *resp, int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp) { slurm_fd fd = -1; - int rc; + int rc = SLURM_SUCCESS; - if ((fd = slurm_open_controller_conn()) < 0) - return SLURM_SOCKET_ERROR; + if ((fd = slurm_open_controller_conn()) < 0) { + rc = SLURM_SOCKET_ERROR; + goto cleanup; + } rc =_send_and_recv_msg(fd, req, resp, 0); /* If the backup controller is in the process of assuming @@ -802,6 +829,10 @@ int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp) return SLURM_SOCKET_ERROR; rc =_send_and_recv_msg(fd, req, resp, 0); } + + cleanup: + if (rc != SLURM_SUCCESS) + _remap_slurmctld_errno(); return rc; } @@ -837,16 +868,22 @@ int slurm_send_only_controller_msg(slurm_msg_t *req) /* * Open connection to SLURM controller: */ - if ((fd = slurm_open_controller_conn()) < 0) - return SLURM_SOCKET_ERROR; + if ((fd = slurm_open_controller_conn()) < 0) { + rc = SLURM_SOCKET_ERROR; + goto cleanup; + } rc = slurm_send_node_msg(fd, req); - if (slurm_shutdown_msg_conn(fd) < 0) - return SLURM_SOCKET_ERROR; - - return rc; + if (slurm_shutdown_msg_conn(fd) < 0) { + rc = SLURM_SOCKET_ERROR; + goto cleanup; + } + cleanup: + if (rc != SLURM_SUCCESS) + _remap_slurmctld_errno(); + return rc; } /* diff --git a/src/common/slurm_protocol_socket_implementation.c 
b/src/common/slurm_protocol_socket_implementation.c index acb74e4aec6..29585488967 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -123,8 +123,6 @@ ssize_t _slurm_msg_recvfrom_timeout(slurm_fd fd, char **pbuf, size_t *lenp, len = _slurm_recv_timeout( fd, (char *)&msglen, sizeof(msglen), 0, tmout ); - if (len < 0) return SLURM_ERROR; - if (len < ((ssize_t) sizeof(msglen))) slurm_seterrno_ret(SLURM_COMMUNICATIONS_RECEIVE_ERROR); @@ -141,6 +139,7 @@ ssize_t _slurm_msg_recvfrom_timeout(slurm_fd fd, char **pbuf, size_t *lenp, if (_slurm_recv_timeout(fd, *pbuf, msglen, 0, tmout) != msglen) { xfree(*pbuf); *pbuf = NULL; + slurm_seterrno_ret(SLURM_COMMUNICATIONS_RECEIVE_ERROR); return SLURM_PROTOCOL_ERROR; } @@ -175,13 +174,16 @@ ssize_t _slurm_msg_sendto_timeout(slurm_fd fd, char *buffer, size_t size, timeout ); if (len < sizeof(usize)) { - len = SLURM_PROTOCOL_ERROR; - goto done; + len = SLURM_PROTOCOL_ERROR; + slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR); + goto done; } - if ((len = _slurm_send_timeout(fd, buffer, size, 0, timeout)) < 0) - goto done; - else if (len < size) { + if ((len = _slurm_send_timeout(fd, buffer, size, 0, timeout)) < 0) { + slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR); + goto done; + } + if (len < size) { len = SLURM_PROTOCOL_ERROR; slurm_seterrno(SLURM_PROTOCOL_SOCKET_IMPL_NOT_ALL_DATA_SENT); goto done; @@ -216,21 +218,23 @@ int _slurm_send_timeout(slurm_fd fd, char *buf, size_t size, goto done; } if ((rc = poll(&ufds, 1, timeout)) <= 0) { - if ((rc == 0) || (errno == EINTR)) - continue; - else { - sent = SLURM_ERROR; - goto done; - } + if ((rc == 0) || (errno == EINTR)) + continue; + else { + slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR); + sent = SLURM_ERROR; + goto done; + } } rc = _slurm_send(fd, &buf[sent], (size - sent), flags); if (rc < 0) { - if (errno == EINTR) - continue; - else { - sent = SLURM_ERROR; - goto done; - } + if (errno == EINTR) + continue; + 
else { + slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR); + sent = SLURM_ERROR; + goto done; + } } if (rc == 0) { slurm_seterrno(SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT); @@ -278,8 +282,10 @@ int _slurm_recv_timeout(slurm_fd fd, char *buffer, size_t size, if ((errno == EINTR) || (rc == 0)) continue; else { - recvlen = SLURM_ERROR; - goto done; + slurm_seterrno( + SLURM_COMMUNICATIONS_RECEIVE_ERROR); + recvlen = SLURM_ERROR; + goto done; } } rc = _slurm_recv(fd, &buffer[recvlen], size - recvlen, flags); @@ -287,6 +293,8 @@ int _slurm_recv_timeout(slurm_fd fd, char *buffer, size_t size, if (errno == EINTR) continue; else { + slurm_seterrno( + SLURM_COMMUNICATIONS_RECEIVE_ERROR); recvlen = SLURM_ERROR; goto done; } -- GitLab