diff --git a/src/api/signal.c b/src/api/signal.c index 496c4e62df38720cd20947b01af5ad5d242e99f7..0cf04ad1b9e2e0e716dc1e68644e57fc6a01c696 100644 --- a/src/api/signal.c +++ b/src/api/signal.c @@ -274,8 +274,10 @@ static int _signal_batch_script_step( msg.data = &rpc; msg.address = allocation->node_addr[0]; - slurm_send_recv_rc_msg_only_one(&msg, &rc, 10); - + if (slurm_send_recv_rc_msg_only_one(&msg, &rc, 0) < 0) { + error("_signal_batch_script_step: %m"); + rc = -1; + } return rc; } @@ -351,8 +353,11 @@ _thr_send_recv_rc_msg(void *args) pthread_cond_t *cond = params->cond; int *active = params->active; - slurm_send_recv_rc_msg_only_one(params->msg, - params->rc, params->timeout); + if (slurm_send_recv_rc_msg_only_one(params->msg, params->rc, + params->timeout) < 0) { + error("_thr_send_recv_rc_msg: %m"); + *params->rc = -1; + } xfree(args); slurm_mutex_lock(lock); @@ -546,7 +551,7 @@ static int _terminate_batch_script_step( msg.address = allocation->node_addr[0]; i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 10); - if (i != SLURM_SUCCESS) + if (i != 0) rc = i; return rc; diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 22f91290f4c890d321c2bb1aff1484672cc84862..e8bce0ca3d30f4cef19a5eb8db84e573b87abe2d 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1715,7 +1715,7 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout) slurm_fd fd = -1; List ret_list = NULL; ret_types_t *ret_type = NULL; - int ret_c = SLURM_SUCCESS; + int ret_c = 0; forward_init(&req->forward, NULL); req->ret_list = NULL; @@ -1723,9 +1723,8 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout) /* no need to init forward_struct_init here */ if ((fd = slurm_open_msg_conn(&req->address)) < 0) { - return SLURM_SOCKET_ERROR; + return -1; } - ret_list = _send_recv_rc_msg(fd, req, timeout); if(ret_list) { @@ -1737,12 +1736,15 @@ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout) if(ret_type) { *rc = ret_type->msg_rc; - ret_c = ret_type->err; + // make sure we only send 0 or -1 for an error + if(ret_type->err != 0) + //ret_c = ret_type->err; + ret_c = -1; destroy_ret_types(ret_type); } list_destroy(ret_list); } else - ret_c = SLURM_ERROR; + ret_c = -1; return ret_c; } @@ -1754,7 +1756,7 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc) slurm_fd fd = -1; List ret_list = NULL; ret_types_t *ret_type = NULL; - int ret_val = SLURM_SUCCESS; + int ret_val = 0; forward_init(&req->forward, NULL); req->ret_list = NULL; @@ -1762,7 +1764,7 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc) /* no need to init forward_struct_init here */ if ((fd = slurm_open_controller_conn()) < 0) - return SLURM_SOCKET_ERROR; + return -1; ret_list = _send_recv_rc_msg(fd, req, 0); if(ret_list) { @@ -1773,12 +1775,15 @@ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc) if(ret_type) { *rc = ret_type->msg_rc; - ret_val = ret_type->err; + // make sure we only send 0 or -1 for an error + if(ret_type->err != 0) + //ret_c = ret_type->err; + ret_val = -1; destroy_ret_types(ret_type); } list_destroy(ret_list); } else - ret_val = SLURM_ERROR; + ret_val = -1; return ret_val; } diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 537a90a713852008faf22e132171c2ad8b50d3e7..c19225a59c8557e288212b37dd51474c8ac20b8e 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -557,13 +557,15 @@ List slurm_send_recv_rc_packed_msg(slurm_msg_t *req, int timeout); List slurm_send_recv_rc_msg(slurm_msg_t *req, int timeout); /* - * Same as above, but only to one node + * Same as above, but only to one node + * returns 0 on success, -1 on failure and sets errno */ int slurm_send_recv_rc_msg_only_one(slurm_msg_t *req, int *rc, int timeout); /* * Same as above, but send to controller + * returns 0 on success, -1 on failure and sets errno */ int slurm_send_recv_controller_rc_msg(slurm_msg_t *req, int *rc); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index e77486a474a67ae38a3e250df65f269ef58a06c4..7c23af6ae54c7a5afdd2b9427c5f16cb0872d651 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -537,8 +537,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) /* this is the base of the tree, its parent is slurmctld */ debug3("Rank %d sending complete to slurmctld, range %d to %d", step_complete.rank, first, last); - if (slurm_send_recv_controller_rc_msg(&req, &rc) - != SLURM_SUCCESS) + if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0) error("Rank %d failed sending step completion message" " to slurmctld (parent)", step_complete.rank); goto finished; @@ -554,13 +553,13 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) if (i) sleep(1); retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10); - if (retcode == SLURM_SUCCESS && rc == 0) + if (retcode == 0 && rc == 0) goto finished; } /* on error AGAIN, send to the slurmctld instead */ debug3("Rank %d sending complete to slurmctld instead, range %d to %d", step_complete.rank, first, last); - if (slurm_send_recv_controller_rc_msg(&req, &rc) != SLURM_SUCCESS) + if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0) error("Rank %d failed sending step completion message" " directly to slurmctld", step_complete.rank); finished: @@ -1391,7 +1390,7 @@ _complete_batch_script(slurmd_job_t *job, int err, int status) /* Note: these log messages don't go to slurmd.log from here */ for (i=0; i<=MAX_RETRY; i++) { - if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) >= 0) + if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) == 0) break; info("Retrying job complete RPC for %u.%u", job->jobid, job->stepid);