From c06d45ead984e06ee6d1353713658f3bdd1fc96d Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 8 Jul 2003 22:43:35 +0000
Subject: [PATCH] Added new error codes for communication failures for
 message traffic specifically to slurm controller.

---
 slurm/slurm_errno.h                           |  6 ++
 src/api/allocate.c                            |  6 +-
 src/api/reconfigure.c                         |  8 +--
 src/common/slurm_errno.c                      |  9 +++
 src/common/slurm_protocol_api.c               | 61 +++++++++++++++----
 .../slurm_protocol_socket_implementation.c    | 50 ++++++++-------
 6 files changed, 100 insertions(+), 40 deletions(-)

diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index ee5e6eb3742..dfbfce5af98 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -84,6 +84,12 @@ enum {
 	SLURM_PROTOCOL_AUTHENTICATION_ERROR,
 	SLURM_PROTOCOL_INSANE_MSG_LENGTH,
 
+	/* communication failures to/from slurmctld */
+	SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR =     1800,
+	SLURMCTLD_COMMUNICATIONS_SEND_ERROR,
+	SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR,
+	SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR,
+
 	/* _info.c/communcation layer RESPONSE_SLURM_RC message codes */
 	SLURM_NO_CHANGE_IN_DATA =			1900,
 
diff --git a/src/api/allocate.c b/src/api/allocate.c
index 24107cc0bd1..0895109f3f3 100644
--- a/src/api/allocate.c
+++ b/src/api/allocate.c
@@ -87,7 +87,7 @@ slurm_allocate_resources (job_desc_msg_t *req,
 		req->alloc_node = NULL;
 
 	if (rc == SLURM_SOCKET_ERROR) 
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR);
+		return SLURM_SOCKET_ERROR;
 
 	switch (resp_msg.msg_type) {
 	case RESPONSE_SLURM_RC:
@@ -125,7 +125,7 @@ int slurm_job_will_run (job_desc_msg_t *req,
 	req_msg.data     = req; 
 
 	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR);
+		return SLURM_SOCKET_ERROR;
 
 	switch (resp_msg.msg_type) {
 	case RESPONSE_SLURM_RC:
@@ -181,7 +181,7 @@ slurm_allocate_resources_and_run (job_desc_msg_t *req,
 		req->alloc_node = NULL;
 
 	if (rc == SLURM_SOCKET_ERROR) 
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR);
+		return SLURM_SOCKET_ERROR;
 
 
 	switch (resp_msg.msg_type) {
diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c
index ca88fbefc00..37f027563cf 100644
--- a/src/api/reconfigure.c
+++ b/src/api/reconfigure.c
@@ -121,16 +121,16 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req)
 	slurm_msg_t resp_msg ;
 
 	if ((fd = slurm_open_controller_conn_spec(dest)) < 0)
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_CONNECTION_ERROR);
+		slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR);
 
 	if (slurm_send_node_msg(fd, req) < 0) 
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_SEND_ERROR);
+		slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SEND_ERROR);
 
 	if ((rc = slurm_receive_msg(fd, &resp_msg, 0)) < 0)
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_RECEIVE_ERROR);
+		slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR);
 
 	if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS)
-		slurm_seterrno_ret(SLURM_COMMUNICATIONS_SHUTDOWN_ERROR);
+		slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR);
 
 	if (resp_msg.msg_type != RESPONSE_SLURM_RC)
 		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index f280e5f2d8c..31dcabf501c 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -72,6 +72,15 @@ static slurm_errtab_t slurm_errtab[] = {
         { SLURM_PROTOCOL_INSANE_MSG_LENGTH,
           "Insane message length"                               },
 
+	/* communication failures to/from slurmctld */
+	{ SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR,
+	  "Unable to contact slurm controller (connect failure)" },
+	{ SLURMCTLD_COMMUNICATIONS_SEND_ERROR,
+	  "Unable to contact slurm controller (send failure)"    },
+	{ SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR,
+	  "Unable to contact slurm controller (receive failure)" },
+	{ SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR,
+	  "Unable to contact slurm controller (shutdown failure)"},
 
 	/* _info.c/communcation layer RESPONSE_SLURM_RC message codes */
 
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 26c82cbecc6..16d4751ed48 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -65,6 +65,9 @@ static slurm_protocol_config_t proto_conf_default;
 static slurm_protocol_config_t *proto_conf = &proto_conf_default;
 static slurm_ctl_conf_t slurmctld_conf;
 
+/* STATIC FUNCTIONS */
+static void _remap_slurmctld_errno(void);
+
 /**********************************************************************\
  * protocol configuration functions                
 \**********************************************************************/
@@ -170,6 +173,21 @@ uint16_t slurm_get_wait_time(void)
         return slurmctld_conf.wait_time;
 }
 
+/* Remap a generic SLURM_COMMUNICATIONS_* errno to its SLURMCTLD_COMMUNICATIONS_* counterpart; any other errno value is left unchanged */
+static void _remap_slurmctld_errno(void)
+{
+	int err = slurm_get_errno();
+
+	if (err == SLURM_COMMUNICATIONS_CONNECTION_ERROR)
+		slurm_seterrno(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR);
+	else if (err ==  SLURM_COMMUNICATIONS_SEND_ERROR)
+		slurm_seterrno(SLURMCTLD_COMMUNICATIONS_SEND_ERROR);
+	else if (err == SLURM_COMMUNICATIONS_RECEIVE_ERROR)
+		slurm_seterrno(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR);
+	else if (err == SLURM_COMMUNICATIONS_SHUTDOWN_ERROR)
+		slurm_seterrno(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR);
+}
+
 /**********************************************************************\
  * general message management functions used by slurmctld, slurmd
 \**********************************************************************/
@@ -208,7 +226,10 @@ slurm_fd slurm_init_msg_engine(slurm_addr *addr)
  */
 int slurm_shutdown_msg_engine(slurm_fd fd)
 {
-        return _slurm_close(fd);
+	int rc = _slurm_close(fd);
+	if (rc)
+		slurm_seterrno(SLURM_COMMUNICATIONS_SHUTDOWN_ERROR);
+	return rc;
 }
 
 /* 
@@ -265,7 +286,7 @@ slurm_fd slurm_open_controller_conn()
         debug("Failed to contact secondary controller: %m");
 
     fail:
-        slurm_seterrno_ret(SLURM_COMMUNICATIONS_CONNECTION_ERROR);
+        slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR);
 }
 
 /* calls connect to make a connection-less datagram connection to the 
@@ -276,6 +297,7 @@ slurm_fd slurm_open_controller_conn()
 slurm_fd slurm_open_controller_conn_spec(enum controller_id dest)
 {
         slurm_addr *addr;
+	slurm_fd rc;
 
         if (slurm_api_set_default_config() < 0) {
                 debug3("Error: Unable to set default config");
@@ -288,7 +310,10 @@ slurm_fd slurm_open_controller_conn_spec(enum controller_id dest)
 
         if (!addr) return SLURM_ERROR;
 
-        return slurm_open_msg_conn(addr);
+	rc = slurm_open_msg_conn(addr);
+	if (rc == -1)
+		_remap_slurmctld_errno();
+	return rc;
 }
 
 /* In the bsd implmentation maps directly to a accept call 
@@ -781,10 +806,12 @@ _send_and_recv_msg(slurm_fd fd, slurm_msg_t *req, slurm_msg_t *resp,
 int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp)
 {
         slurm_fd fd = -1;
-	int rc;
+	int rc = SLURM_SUCCESS;
 
-	if ((fd = slurm_open_controller_conn()) < 0)
-                return SLURM_SOCKET_ERROR;
+	if ((fd = slurm_open_controller_conn()) < 0) {
+		rc = SLURM_SOCKET_ERROR;
+		goto cleanup;
+	}
 
         rc =_send_and_recv_msg(fd, req, resp, 0);
 	/* If the backup controller is in the process of assuming 
@@ -802,6 +829,10 @@ int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp)
                 	return SLURM_SOCKET_ERROR;
 		rc =_send_and_recv_msg(fd, req, resp, 0);
 	}
+
+      cleanup:
+	if (rc != SLURM_SUCCESS) 
+ 		_remap_slurmctld_errno(); 
 	return rc;
 }
 
@@ -837,16 +868,22 @@ int slurm_send_only_controller_msg(slurm_msg_t *req)
         /*
          *  Open connection to SLURM controller:
          */
-        if ((fd = slurm_open_controller_conn()) < 0)
-                return SLURM_SOCKET_ERROR;
+	if ((fd = slurm_open_controller_conn()) < 0) {
+		rc = SLURM_SOCKET_ERROR;
+		goto cleanup;
+	}
 
         rc = slurm_send_node_msg(fd, req);
 
-        if (slurm_shutdown_msg_conn(fd) < 0)
-                return SLURM_SOCKET_ERROR;
-
-        return rc;
+	if (slurm_shutdown_msg_conn(fd) < 0) {
+		rc = SLURM_SOCKET_ERROR;
+		goto cleanup;
+	}
 
+      cleanup:
+	if (rc != SLURM_SUCCESS)
+		_remap_slurmctld_errno();
+	return rc;
 }
 
 /* 
diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c
index acb74e4aec6..29585488967 100644
--- a/src/common/slurm_protocol_socket_implementation.c
+++ b/src/common/slurm_protocol_socket_implementation.c
@@ -123,8 +123,6 @@ ssize_t _slurm_msg_recvfrom_timeout(slurm_fd fd, char **pbuf, size_t *lenp,
         len = _slurm_recv_timeout( fd, (char *)&msglen, 
                                    sizeof(msglen), 0, tmout );
 
-        if (len < 0) return SLURM_ERROR;
-
         if (len < ((ssize_t) sizeof(msglen))) 
                 slurm_seterrno_ret(SLURM_COMMUNICATIONS_RECEIVE_ERROR);
 
@@ -141,6 +139,7 @@ ssize_t _slurm_msg_recvfrom_timeout(slurm_fd fd, char **pbuf, size_t *lenp,
         if (_slurm_recv_timeout(fd, *pbuf, msglen, 0, tmout) != msglen) {
                 xfree(*pbuf);
                 *pbuf = NULL;
+		slurm_seterrno_ret(SLURM_COMMUNICATIONS_RECEIVE_ERROR);
                 return SLURM_PROTOCOL_ERROR;
         }
 
@@ -175,13 +174,16 @@ ssize_t _slurm_msg_sendto_timeout(slurm_fd fd, char *buffer, size_t size,
                                    timeout );
 
         if (len < sizeof(usize)) {
-                len = SLURM_PROTOCOL_ERROR;
-                goto done;
+		len = SLURM_PROTOCOL_ERROR;
+		slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
+		goto done;
         }
 
-        if ((len = _slurm_send_timeout(fd, buffer, size, 0, timeout)) < 0)
-                goto done;
-        else if (len < size) {
+	if ((len = _slurm_send_timeout(fd, buffer, size, 0, timeout)) < 0) {
+		slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
+		goto done;
+	}
+        if (len < size) {
                 len = SLURM_PROTOCOL_ERROR;
                 slurm_seterrno(SLURM_PROTOCOL_SOCKET_IMPL_NOT_ALL_DATA_SENT);
                 goto done;
@@ -216,21 +218,23 @@ int _slurm_send_timeout(slurm_fd fd, char *buf, size_t size,
                         goto done;
                 }
                 if ((rc = poll(&ufds, 1, timeout)) <= 0) {
-                        if ((rc == 0) || (errno == EINTR)) 
-                                continue;
-                         else {
-                                sent = SLURM_ERROR;
-                                goto done;
-                         }
+			if ((rc == 0) || (errno == EINTR)) 
+ 				continue;
+			else {
+				slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
+				sent = SLURM_ERROR;
+				goto done;
+			}
                 }
                 rc = _slurm_send(fd, &buf[sent], (size - sent), flags);
                 if (rc < 0) {
-                        if (errno == EINTR)
-                                continue;
-                        else {
-                                sent = SLURM_ERROR;
-                                goto done;
-                        }
+ 			if (errno == EINTR)
+				continue;
+			else {
+ 				slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
+				sent = SLURM_ERROR;
+				goto done;
+			}
                 }
                 if (rc == 0) {
                         slurm_seterrno(SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT);
@@ -278,8 +282,10 @@ int _slurm_recv_timeout(slurm_fd fd, char *buffer, size_t size,
                         if ((errno == EINTR) || (rc == 0))
                                 continue;
                         else {
-                                recvlen = SLURM_ERROR; 
-                                goto done;
+ 				slurm_seterrno(
+					SLURM_COMMUNICATIONS_RECEIVE_ERROR);
+ 				recvlen = SLURM_ERROR; 
+  				goto done;
                         }
                 } 
                 rc = _slurm_recv(fd, &buffer[recvlen], size - recvlen, flags);
@@ -287,6 +293,8 @@ int _slurm_recv_timeout(slurm_fd fd, char *buffer, size_t size,
                         if (errno == EINTR)
                                 continue;
                         else {
+				slurm_seterrno(
+					SLURM_COMMUNICATIONS_RECEIVE_ERROR);
                                 recvlen = SLURM_ERROR; 
                                 goto done;
                         }
-- 
GitLab