Forwarded replies: read the return code according to the reply's message type.

What this patch changes (everything below is taken from the hunks themselves):

  src/common/forward.c
    * _forward_thread() fills type->msg_rc via slurm_get_return_code()
      instead of casting every reply payload to return_code_msg_t.
    * forward_msg() logs an error, frees span and returns SLURM_ERROR when
      forward_struct->ret_list is not set, and copies forward_struct->timeout
      into each forward_msg entry.
  src/common/slurm_protocol_defs.c
    * slurm_free_msg_data() gains a RESPONSE_REATTACH_TASKS case.
    * New slurm_get_return_code(type, data): returns the return_code field of
      the payload for five message types; for any other type it logs an error
      and returns rc, which is still its initial value 0.
  src/common/slurm_protocol_defs.h
    * stat_jobacct_msg and job_id_response_msg gain a return_code field;
      slurm_get_return_code() is declared.
  src/slurmd/slurmd/req.c
    * _rpc_stat_jobacct() and _rpc_pid2jid() set return_code to SLURM_SUCCESS;
      the reattach response copies forward_struct and forward_struct_init
      from the request message.

NOTE(review): this text was rebuilt from a copy whose newlines had been
collapsed.  Line structure was checked against every hunk's line counts, but
indentation (tabs assumed) and the position of blank context lines are
reconstructed; verify against the base blobs or apply with
whitespace-insensitive matching.  The removed/added pair in _forward_thread()
("type->type = msg.msg_type;") is a whitespace-only change and is shown with
identical text.  The whitespace-only lines in forward_wait() (removed and
added) and _rpc_stat_jobacct() (removed) are shown as a bare marker, because
their original whitespace cannot be recovered.

NOTE(review): slurm_get_return_code() dereferences data without a NULL check
for the five handled types, and unknown types yield 0; confirm both are
intended by the callers.

diff --git a/src/common/forward.c b/src/common/forward.c
index a93e7cbb7960e681d2e8f27207c07437395b3ac5..cc271f982675f9fb8fe53ec0e38a0ecd4b78c2bf 100644
--- a/src/common/forward.c
+++ b/src/common/forward.c
@@ -176,8 +176,8 @@ void *_forward_thread(void *arg)
 			type->msg_rc = SLURM_ERROR;
 			ret_data_info->data = NULL;
 		} else {
-			type->type = msg.msg_type;
-			type->msg_rc = ((return_code_msg_t *)msg.data)->return_code;
+			type->type = msg.msg_type;
+			type->msg_rc = slurm_get_return_code(type->type, msg.data);
 			ret_data_info->data = msg.data;
 			g_slurm_auth_destroy(msg.auth_cred);
 		}
@@ -275,6 +275,12 @@ extern int forward_msg(forward_struct_t *forward_struct,
 	int thr_count = 0;
 	int *span = set_span(header->forward.cnt, 0);
 
+	if(!forward_struct->ret_list) {
+		error("didn't get a ret_list from forward_struct");
+		xfree(span);
+		return SLURM_ERROR;
+	}
+
 	slurm_mutex_init(&forward_struct->forward_mutex);
 	pthread_cond_init(&forward_struct->notify, NULL);
 
@@ -292,6 +298,7 @@ extern int forward_msg(forward_struct_t *forward_struct,
 
 		forward_msg = &forward_struct->forward_msg[i];
 		forward_msg->ret_list = forward_struct->ret_list;
+		forward_msg->timeout = forward_struct->timeout;
 		forward_msg->notify = &forward_struct->notify;
 		forward_msg->forward_mutex = &forward_struct->forward_mutex;
 
@@ -537,7 +544,7 @@ extern void forward_wait(slurm_msg_t * msg)
 	int count = 0;
 	ret_types_t *ret_type = NULL;
 	ListIterator itr;
-
+
 	/* wait for all the other messages on the tree under us */
 	if(msg->forward_struct_init == FORWARD_INIT && msg->forward_struct) {
 		debug2("looking for %d", msg->forward_struct->fwd_cnt);
@@ -656,3 +663,4 @@ void destroy_ret_types(void *object)
 		xfree(ret_type);
 	}
 }
+
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 5e4aef6ae4c5274d1f7d24a21e9f974ef5576f76..b107d2abe0cd3fc2c990a0759a93bf6bbf91bb3f 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -1107,6 +1107,9 @@ extern int slurm_free_msg_data(uint32_t type, void *data)
 	case REQUEST_REATTACH_TASKS:
 		slurm_free_reattach_tasks_request_msg(data);
 		break;
+	case RESPONSE_REATTACH_TASKS:
+		slurm_free_reattach_tasks_response_msg(data);
+		break;
 	case REQUEST_SIGNAL_JOB:
 		slurm_free_signal_job_msg(data);
 		break;
@@ -1132,3 +1135,30 @@ extern int slurm_free_msg_data(uint32_t type, void *data)
 	return SLURM_SUCCESS;
 }
 
+extern uint32_t slurm_get_return_code(uint32_t type, void *data)
+{
+	uint32_t rc = 0;
+
+	switch(type) {
+	case MESSAGE_EPILOG_COMPLETE:
+		rc = ((epilog_complete_msg_t *)data)->return_code;
+		break;
+	case MESSAGE_STAT_JOBACCT:
+		rc = ((stat_jobacct_msg_t *)data)->return_code;
+		break;
+	case RESPONSE_REATTACH_TASKS:
+		rc = ((reattach_tasks_response_msg_t *)data)->return_code;
+		break;
+	case RESPONSE_JOB_ID:
+		rc = ((job_id_response_msg_t *)data)->return_code;
+		break;
+	case RESPONSE_SLURM_RC:
+		rc = ((return_code_msg_t *)data)->return_code;
+		break;
+	default:
+		error("don't know the rc for type %u returning %u", type, rc);
+		break;
+	}
+	return rc;
+}
+
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index f5a664b2a16399b317e7284de739d2d2e2243b55..2c02b410cb503addab9f51003eacab668c12be4f 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -357,6 +357,7 @@ typedef struct step_complete_msg {
 
 typedef struct stat_jobacct_msg {
 	uint32_t job_id;
+	uint32_t return_code;
 	uint32_t step_id;
 	uint32_t num_tasks;
 	jobacctinfo_t *jobacct;
@@ -539,6 +540,7 @@ typedef struct job_id_request_msg {
 
 typedef struct job_id_response_msg {
 	uint32_t job_id;	/* slurm job_id */
+	uint32_t return_code;	/* slurm return code */
 } job_id_response_msg_t;
 
 typedef struct srun_ping_msg {
@@ -716,6 +718,7 @@ void inline slurm_free_stat_jobacct_msg(stat_jobacct_msg_t *msg);
 void inline slurm_free_node_select_msg(
 		node_info_select_request_msg_t *msg);
 extern int slurm_free_msg_data(uint32_t type, void *data);
+extern uint32_t slurm_get_return_code(uint32_t type, void *data);
 
 extern char *job_reason_string(enum job_wait_reason inx);
 extern char *job_state_string(enum job_states inx);
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 3b75c4741fca367377a5e19b5621c2c41589fc10..bb044a2077b286ff76e3504e548980a5afcbd76d 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -1197,7 +1197,7 @@ _rpc_stat_jobacct(slurm_msg_t *msg)
 	resp = xmalloc(sizeof(stat_jobacct_msg_t));
 	resp->job_id = req->job_id;
 	resp->step_id = req->step_id;
-
+	resp->return_code = SLURM_SUCCESS;
 	fd = stepd_connect(conf->spooldir, conf->node_name,
 			   req->job_id, req->step_id);
 	if (fd == -1) {
@@ -1283,6 +1283,7 @@ static void _rpc_pid2jid(slurm_msg_t *msg)
 		if (stepd_pid_in_container(fd, req->job_pid)
 		    || req->job_pid == stepd_daemon_pid(fd)) {
 			resp.job_id = stepd->jobid;
+			resp.return_code = SLURM_SUCCESS;
 			found = true;
 			close(fd);
 			break;
@@ -1502,6 +1503,8 @@ done:
 	resp_msg.data = resp;
 	resp_msg.msg_type = RESPONSE_REATTACH_TASKS;
 	resp_msg.forward = msg->forward;
+	resp_msg.forward_struct = msg->forward_struct;
+	resp_msg.forward_struct_init = msg->forward_struct_init;
 	resp_msg.ret_list = msg->ret_list;
 	resp->node_name = xstrdup(conf->node_name);
 	resp->srun_node_id = nodeid;