Skip to content
Snippets Groups Projects
Commit 1ca56ba9 authored by Danny Auble's avatar Danny Auble
Browse files

added function to get the return code from different messages and also made it...

Added a function to get the return code from different message types, and also
made it so that reattach waits for forwarded messages to return.
parent a8812e28
No related branches found
No related tags found
No related merge requests found
...@@ -176,8 +176,8 @@ void *_forward_thread(void *arg) ...@@ -176,8 +176,8 @@ void *_forward_thread(void *arg)
type->msg_rc = SLURM_ERROR; type->msg_rc = SLURM_ERROR;
ret_data_info->data = NULL; ret_data_info->data = NULL;
} else { } else {
type->type = msg.msg_type; type->type = msg.msg_type;
type->msg_rc = ((return_code_msg_t *)msg.data)->return_code; type->msg_rc = slurm_get_return_code(type->type, msg.data);
ret_data_info->data = msg.data; ret_data_info->data = msg.data;
g_slurm_auth_destroy(msg.auth_cred); g_slurm_auth_destroy(msg.auth_cred);
} }
...@@ -275,6 +275,12 @@ extern int forward_msg(forward_struct_t *forward_struct, ...@@ -275,6 +275,12 @@ extern int forward_msg(forward_struct_t *forward_struct,
int thr_count = 0; int thr_count = 0;
int *span = set_span(header->forward.cnt, 0); int *span = set_span(header->forward.cnt, 0);
if(!forward_struct->ret_list) {
error("didn't get a ret_list from forward_struct");
xfree(span);
return SLURM_ERROR;
}
slurm_mutex_init(&forward_struct->forward_mutex); slurm_mutex_init(&forward_struct->forward_mutex);
pthread_cond_init(&forward_struct->notify, NULL); pthread_cond_init(&forward_struct->notify, NULL);
...@@ -292,6 +298,7 @@ extern int forward_msg(forward_struct_t *forward_struct, ...@@ -292,6 +298,7 @@ extern int forward_msg(forward_struct_t *forward_struct,
forward_msg = &forward_struct->forward_msg[i]; forward_msg = &forward_struct->forward_msg[i];
forward_msg->ret_list = forward_struct->ret_list; forward_msg->ret_list = forward_struct->ret_list;
forward_msg->timeout = forward_struct->timeout; forward_msg->timeout = forward_struct->timeout;
forward_msg->notify = &forward_struct->notify; forward_msg->notify = &forward_struct->notify;
forward_msg->forward_mutex = &forward_struct->forward_mutex; forward_msg->forward_mutex = &forward_struct->forward_mutex;
...@@ -537,7 +544,7 @@ extern void forward_wait(slurm_msg_t * msg) ...@@ -537,7 +544,7 @@ extern void forward_wait(slurm_msg_t * msg)
int count = 0; int count = 0;
ret_types_t *ret_type = NULL; ret_types_t *ret_type = NULL;
ListIterator itr; ListIterator itr;
/* wait for all the other messages on the tree under us */ /* wait for all the other messages on the tree under us */
if(msg->forward_struct_init == FORWARD_INIT && msg->forward_struct) { if(msg->forward_struct_init == FORWARD_INIT && msg->forward_struct) {
debug2("looking for %d", msg->forward_struct->fwd_cnt); debug2("looking for %d", msg->forward_struct->fwd_cnt);
...@@ -656,3 +663,4 @@ void destroy_ret_types(void *object) ...@@ -656,3 +663,4 @@ void destroy_ret_types(void *object)
xfree(ret_type); xfree(ret_type);
} }
} }
...@@ -1107,6 +1107,9 @@ extern int slurm_free_msg_data(uint32_t type, void *data) ...@@ -1107,6 +1107,9 @@ extern int slurm_free_msg_data(uint32_t type, void *data)
case REQUEST_REATTACH_TASKS: case REQUEST_REATTACH_TASKS:
slurm_free_reattach_tasks_request_msg(data); slurm_free_reattach_tasks_request_msg(data);
break; break;
case RESPONSE_REATTACH_TASKS:
slurm_free_reattach_tasks_response_msg(data);
break;
case REQUEST_SIGNAL_JOB: case REQUEST_SIGNAL_JOB:
slurm_free_signal_job_msg(data); slurm_free_signal_job_msg(data);
break; break;
...@@ -1132,3 +1135,30 @@ extern int slurm_free_msg_data(uint32_t type, void *data) ...@@ -1132,3 +1135,30 @@ extern int slurm_free_msg_data(uint32_t type, void *data)
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
extern uint32_t slurm_get_return_code(uint32_t type, void *data)
{
uint32_t rc = 0;
switch(type) {
case MESSAGE_EPILOG_COMPLETE:
rc = ((epilog_complete_msg_t *)data)->return_code;
break;
case MESSAGE_STAT_JOBACCT:
rc = ((stat_jobacct_msg_t *)data)->return_code;
break;
case RESPONSE_REATTACH_TASKS:
rc = ((reattach_tasks_response_msg_t *)data)->return_code;
break;
case RESPONSE_JOB_ID:
rc = ((job_id_response_msg_t *)data)->return_code;
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *)data)->return_code;
break;
default:
error("don't know the rc for type %u returning %u", type, rc);
break;
}
return rc;
}
...@@ -357,6 +357,7 @@ typedef struct step_complete_msg { ...@@ -357,6 +357,7 @@ typedef struct step_complete_msg {
typedef struct stat_jobacct_msg { typedef struct stat_jobacct_msg {
uint32_t job_id; uint32_t job_id;
uint32_t return_code;
uint32_t step_id; uint32_t step_id;
uint32_t num_tasks; uint32_t num_tasks;
jobacctinfo_t *jobacct; jobacctinfo_t *jobacct;
...@@ -539,6 +540,7 @@ typedef struct job_id_request_msg { ...@@ -539,6 +540,7 @@ typedef struct job_id_request_msg {
typedef struct job_id_response_msg { typedef struct job_id_response_msg {
uint32_t job_id; /* slurm job_id */ uint32_t job_id; /* slurm job_id */
uint32_t return_code; /* slurm return code */
} job_id_response_msg_t; } job_id_response_msg_t;
typedef struct srun_ping_msg { typedef struct srun_ping_msg {
...@@ -716,6 +718,7 @@ void inline slurm_free_stat_jobacct_msg(stat_jobacct_msg_t *msg); ...@@ -716,6 +718,7 @@ void inline slurm_free_stat_jobacct_msg(stat_jobacct_msg_t *msg);
void inline slurm_free_node_select_msg( void inline slurm_free_node_select_msg(
node_info_select_request_msg_t *msg); node_info_select_request_msg_t *msg);
extern int slurm_free_msg_data(uint32_t type, void *data); extern int slurm_free_msg_data(uint32_t type, void *data);
extern uint32_t slurm_get_return_code(uint32_t type, void *data);
extern char *job_reason_string(enum job_wait_reason inx); extern char *job_reason_string(enum job_wait_reason inx);
extern char *job_state_string(enum job_states inx); extern char *job_state_string(enum job_states inx);
......
...@@ -1197,7 +1197,7 @@ _rpc_stat_jobacct(slurm_msg_t *msg) ...@@ -1197,7 +1197,7 @@ _rpc_stat_jobacct(slurm_msg_t *msg)
resp = xmalloc(sizeof(stat_jobacct_msg_t)); resp = xmalloc(sizeof(stat_jobacct_msg_t));
resp->job_id = req->job_id; resp->job_id = req->job_id;
resp->step_id = req->step_id; resp->step_id = req->step_id;
resp->return_code = SLURM_SUCCESS;
fd = stepd_connect(conf->spooldir, conf->node_name, fd = stepd_connect(conf->spooldir, conf->node_name,
req->job_id, req->step_id); req->job_id, req->step_id);
if (fd == -1) { if (fd == -1) {
...@@ -1283,6 +1283,7 @@ static void _rpc_pid2jid(slurm_msg_t *msg) ...@@ -1283,6 +1283,7 @@ static void _rpc_pid2jid(slurm_msg_t *msg)
if (stepd_pid_in_container(fd, req->job_pid) if (stepd_pid_in_container(fd, req->job_pid)
|| req->job_pid == stepd_daemon_pid(fd)) { || req->job_pid == stepd_daemon_pid(fd)) {
resp.job_id = stepd->jobid; resp.job_id = stepd->jobid;
resp.return_code = SLURM_SUCCESS;
found = true; found = true;
close(fd); close(fd);
break; break;
...@@ -1502,6 +1503,8 @@ done: ...@@ -1502,6 +1503,8 @@ done:
resp_msg.data = resp; resp_msg.data = resp;
resp_msg.msg_type = RESPONSE_REATTACH_TASKS; resp_msg.msg_type = RESPONSE_REATTACH_TASKS;
resp_msg.forward = msg->forward; resp_msg.forward = msg->forward;
resp_msg.forward_struct = msg->forward_struct;
resp_msg.forward_struct_init = msg->forward_struct_init;
resp_msg.ret_list = msg->ret_list; resp_msg.ret_list = msg->ret_list;
resp->node_name = xstrdup(conf->node_name); resp->node_name = xstrdup(conf->node_name);
resp->srun_node_id = nodeid; resp->srun_node_id = nodeid;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment