From 213059efe7f8d1fc72d5f876ae949730d8e98b16 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Tue, 30 May 2006 16:40:24 +0000
Subject: [PATCH] updated return codes from slurm_send_recv_rc_msg_only_one and
 slurm_send_recv_controller_rc_msg to either 0 or -1 and added error checking
 to api/signal.c

---
 src/api/signal.c                | 15 ++++++++++-----
 src/common/slurm_protocol_api.c | 23 ++++++++++++++---------
 src/common/slurm_protocol_api.h |  4 +++-
 src/slurmd/slurmstepd/mgr.c     |  9 ++++-----
 4 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/src/api/signal.c b/src/api/signal.c
index 496c4e62df3..0cf04ad1b9e 100644
--- a/src/api/signal.c
+++ b/src/api/signal.c
@@ -274,8 +274,10 @@ static int _signal_batch_script_step(
 	msg.data = &rpc;
 	msg.address = allocation->node_addr[0];
 
-	slurm_send_recv_rc_msg_only_one(&msg, &rc, 10);
-	
+	if (slurm_send_recv_rc_msg_only_one(&msg, &rc, 0) < 0) {
+		error("_signal_batch_script_step: %m");
+		rc = -1;
+	}
 	return rc;
 }
 
@@ -351,8 +353,11 @@ _thr_send_recv_rc_msg(void *args)
 	pthread_cond_t *cond = params->cond;
 	int *active = params->active;
 
-	slurm_send_recv_rc_msg_only_one(params->msg, 
-					params->rc, params->timeout);
+	if (slurm_send_recv_rc_msg_only_one(params->msg, params->rc, 
+					    params->timeout) < 0) {
+		error("_thr_send_recv_rc_msg: %m");
+		*params->rc = -1;
+	}
 
 	xfree(args);
 	slurm_mutex_lock(lock);
@@ -546,7 +551,7 @@ static int _terminate_batch_script_step(
 	msg.address = allocation->node_addr[0];
 
 	i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 10);
-	if (i != SLURM_SUCCESS)
+	if (i != 0)
 		rc = i;
 
 	return rc;
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 22f91290f4c..e8bce0ca3d3 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -1715,7 +1715,7 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout)
 	slurm_fd fd = -1;
 	List ret_list = NULL;
 	ret_types_t *ret_type = NULL;
-	int ret_c = SLURM_SUCCESS;
+	int ret_c = 0;
 
 	forward_init(&req->forward, NULL);
 	req->ret_list = NULL;
@@ -1723,9 +1723,8 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout)
 	/* no need to init forward_struct_init here */
 		
 	if ((fd = slurm_open_msg_conn(&req->address)) < 0) {
-		return SLURM_SOCKET_ERROR;
+		return -1;
 	}
-
 			
 	ret_list = _send_recv_rc_msg(fd, req, timeout);
 	if(ret_list) {
@@ -1737,12 +1736,15 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout)
 	
 		if(ret_type) {
 			*rc = ret_type->msg_rc;
-			ret_c = ret_type->err;
+			// make sure we only send 0 or -1 for an error
+			if(ret_type->err != 0) 
+				//ret_c = ret_type->err;
+				ret_c = -1;
 			destroy_ret_types(ret_type);
 		}
 		list_destroy(ret_list);
 	} else 
-		ret_c = SLURM_ERROR;
+		ret_c = -1;
 	return ret_c;
 }
 
@@ -1754,7 +1756,7 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc)
 	slurm_fd fd = -1;
 	List ret_list = NULL;
 	ret_types_t *ret_type = NULL;
-	int ret_val = SLURM_SUCCESS;
+	int ret_val = 0;
 
 	forward_init(&req->forward, NULL);
 	req->ret_list = NULL;
@@ -1762,7 +1764,7 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc)
 	/* no need to init forward_struct_init here */
 		
 	if ((fd = slurm_open_controller_conn()) < 0)
-		return SLURM_SOCKET_ERROR;
+		return -1;
 	ret_list = _send_recv_rc_msg(fd, req, 0);
 	
 	if(ret_list) {
@@ -1773,12 +1775,15 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc)
 		
 		if(ret_type) {
 			*rc = ret_type->msg_rc;
-			ret_val = ret_type->err;
+			// make sure we only send 0 or -1 for an error
+			if(ret_type->err != 0) 
+				//ret_val = ret_type->err;
+				ret_val = -1;
 			destroy_ret_types(ret_type);
 		}
 		list_destroy(ret_list);
 	} else 
-		ret_val = SLURM_ERROR;
+		ret_val = -1;
 	return ret_val;
 }
 
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 537a90a7138..c19225a59c8 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -557,13 +557,15 @@ List slurm_send_recv_rc_packed_msg(slurm_msg_t *req, int timeout);
 List slurm_send_recv_rc_msg(slurm_msg_t *req, int timeout);
 
 /*
- *  Same as above, but only to one node
+ *  Same as above, but only to one node 
+ *  returns 0 on success, -1 on failure and sets errno
  */
 
 int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout);
 
 /*
  *  Same as above, but send to controller
+ *  returns 0 on success, -1 on failure and sets errno
  */
 int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc);
 
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index e77486a474a..7c23af6ae54 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -537,8 +537,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last)
 		/* this is the base of the tree, its parent is slurmctld */
 		debug3("Rank %d sending complete to slurmctld, range %d to %d",
 		       step_complete.rank, first, last);
-		if (slurm_send_recv_controller_rc_msg(&req, &rc)
-		    != SLURM_SUCCESS)
+		if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
 			error("Rank %d failed sending step completion message"
 			      " to slurmctld (parent)", step_complete.rank);
 		goto finished;
@@ -554,13 +553,13 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last)
 		if (i)
 			sleep(1);
 		retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10);
-		if (retcode == SLURM_SUCCESS && rc == 0)
+		if (retcode == 0 && rc == 0)
 			goto finished;
 	}
 	/* on error AGAIN, send to the slurmctld instead */
 	debug3("Rank %d sending complete to slurmctld instead, range %d to %d",
 	       step_complete.rank, first, last);
-	if (slurm_send_recv_controller_rc_msg(&req, &rc) != SLURM_SUCCESS)
+	if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
 		error("Rank %d failed sending step completion message"
 		      " directly to slurmctld", step_complete.rank);
 finished:
@@ -1391,7 +1390,7 @@ _complete_batch_script(slurmd_job_t *job, int err, int status)
 
 	/* Note: these log messages don't go to slurmd.log from here */
 	for (i=0; i<=MAX_RETRY; i++) {
-		if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) >= 0)
+		if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) == 0)
 			break;
 		info("Retrying job complete RPC for %u.%u",
 		     job->jobid, job->stepid);
-- 
GitLab