From 213059efe7f8d1fc72d5f876ae949730d8e98b16 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Tue, 30 May 2006 16:40:24 +0000 Subject: [PATCH] updated return codes from slurm_send_recv_rc_msg_only_one and slurm_send_recv_controller_rc_msg to either 0 or -1 and added error checking to api/signal.c --- src/api/signal.c | 15 ++++++++++----- src/common/slurm_protocol_api.c | 23 ++++++++++++++--------- src/common/slurm_protocol_api.h | 4 +++- src/slurmd/slurmstepd/mgr.c | 9 ++++----- 4 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/api/signal.c b/src/api/signal.c index 496c4e62df3..0cf04ad1b9e 100644 --- a/src/api/signal.c +++ b/src/api/signal.c @@ -274,8 +274,10 @@ static int _signal_batch_script_step( msg.data = &rpc; msg.address = allocation->node_addr[0]; - slurm_send_recv_rc_msg_only_one(&msg, &rc, 10); - + if (slurm_send_recv_rc_msg_only_one(&msg, &rc, 0) < 0) { + error("_signal_batch_script_step: %m"); + rc = -1; + } return rc; } @@ -351,8 +353,11 @@ _thr_send_recv_rc_msg(void *args) pthread_cond_t *cond = params->cond; int *active = params->active; - slurm_send_recv_rc_msg_only_one(params->msg, - params->rc, params->timeout); + if (slurm_send_recv_rc_msg_only_one(params->msg, params->rc, + params->timeout) < 0) { + error("_thr_send_recv_rc_msg: %m"); + *params->rc = -1; + } xfree(args); slurm_mutex_lock(lock); @@ -546,7 +551,7 @@ static int _terminate_batch_script_step( msg.address = allocation->node_addr[0]; i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 10); - if (i != SLURM_SUCCESS) + if (i != 0) rc = i; return rc; diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 22f91290f4c..e8bce0ca3d3 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1715,7 +1715,7 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout) slurm_fd fd = -1; List ret_list = NULL; ret_types_t *ret_type = NULL; - int ret_c = SLURM_SUCCESS; + int ret_c = 0; 
forward_init(&req->forward, NULL); req->ret_list = NULL; @@ -1723,9 +1723,8 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout) /* no need to init forward_struct_init here */ if ((fd = slurm_open_msg_conn(&req->address)) < 0) { - return SLURM_SOCKET_ERROR; + return -1; } - ret_list = _send_recv_rc_msg(fd, req, timeout); if(ret_list) { @@ -1737,12 +1736,15 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout) if(ret_type) { *rc = ret_type->msg_rc; - ret_c = ret_type->err; + // make sure we only send 0 or -1 for an error + if(ret_type->err != 0) + //ret_c = ret_type->err; + ret_c = -1; destroy_ret_types(ret_type); } list_destroy(ret_list); } else - ret_c = SLURM_ERROR; + ret_c = -1; return ret_c; } @@ -1754,7 +1756,7 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc) slurm_fd fd = -1; List ret_list = NULL; ret_types_t *ret_type = NULL; - int ret_val = SLURM_SUCCESS; + int ret_val = 0; forward_init(&req->forward, NULL); req->ret_list = NULL; @@ -1762,7 +1764,7 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc) /* no need to init forward_struct_init here */ if ((fd = slurm_open_controller_conn()) < 0) - return SLURM_SOCKET_ERROR; + return -1; ret_list = _send_recv_rc_msg(fd, req, 0); if(ret_list) { @@ -1773,12 +1775,15 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc) if(ret_type) { *rc = ret_type->msg_rc; - ret_val = ret_type->err; + // make sure we only send 0 or -1 for an error + if(ret_type->err != 0) + //ret_c = ret_type->err; + ret_val = -1; destroy_ret_types(ret_type); } list_destroy(ret_list); } else - ret_val = SLURM_ERROR; + ret_val = -1; return ret_val; } diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 537a90a7138..c19225a59c8 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -557,13 +557,15 @@ List slurm_send_recv_rc_packed_msg(slurm_msg_t *req, int timeout); List 
slurm_send_recv_rc_msg(slurm_msg_t *req, int timeout); /* - * Same as above, but only to one node + * Same as above, but only to one node + * returns 0 on success, -1 on failure and sets errno */ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout); /* * Same as above, but send to controller + * returns 0 on success, -1 on failure and sets errno */ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index e77486a474a..7c23af6ae54 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -537,8 +537,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) /* this is the base of the tree, its parent is slurmctld */ debug3("Rank %d sending complete to slurmctld, range %d to %d", step_complete.rank, first, last); - if (slurm_send_recv_controller_rc_msg(&req, &rc) - != SLURM_SUCCESS) + if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0) error("Rank %d failed sending step completion message" " to slurmctld (parent)", step_complete.rank); goto finished; @@ -554,13 +553,13 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) if (i) sleep(1); retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10); - if (retcode == SLURM_SUCCESS && rc == 0) + if (retcode == 0 && rc == 0) goto finished; } /* on error AGAIN, send to the slurmctld instead */ debug3("Rank %d sending complete to slurmctld instead, range %d to %d", step_complete.rank, first, last); - if (slurm_send_recv_controller_rc_msg(&req, &rc) != SLURM_SUCCESS) + if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0) error("Rank %d failed sending step completion message" " directly to slurmctld", step_complete.rank); finished: @@ -1391,7 +1390,7 @@ _complete_batch_script(slurmd_job_t *job, int err, int status) /* Note: these log messages don't go to slurmd.log from here */ for (i=0; i<=MAX_RETRY; i++) { - if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) >= 0) + if 
(slurm_send_recv_controller_rc_msg(&req_msg, &rc) == 0) break; info("Retrying job complete RPC for %u.%u", job->jobid, job->stepid); -- GitLab