diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index b7ba19adbe74b26c53d2634b42f3492fac0b1221..2276fe3d41b854f4f53cb354a9f371bf180be930 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -4639,45 +4639,70 @@ static void _print_data(char *data, int len) /* * slurm_forward_data - forward arbitrary data to unix domain sockets on nodes - * IN nodelist: nodes to forward data to + * IN/OUT nodelist: Nodes to forward data to (if failure this list is changed to + * reflect the failed nodes). * IN address: address of unix domain socket * IN len: length of data * IN data: real data * RET: error code */ -extern int -slurm_forward_data(char *nodelist, char *address, uint32_t len, char *data) +extern int slurm_forward_data( + char **nodelist, char *address, uint32_t len, const char *data) { List ret_list = NULL; int temp_rc = 0, rc = 0; ret_data_info_t *ret_data_info = NULL; - slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t)); + slurm_msg_t msg; forward_data_msg_t req; + hostlist_t hl = NULL; + bool redo_nodelist = false; + slurm_msg_t_init(&msg); - slurm_msg_t_init(msg); - - debug("slurm_forward_data: nodelist=%s, address=%s, len=%u", - nodelist, address, len); + debug2("slurm_forward_data: nodelist=%s, address=%s, len=%u", + *nodelist, address, len); req.address = address; req.len = len; - req.data = data; + req.data = (char *)data; + + msg.msg_type = REQUEST_FORWARD_DATA; + msg.data = &req; - msg->msg_type = REQUEST_FORWARD_DATA; - msg->data = &req; + if ((ret_list = slurm_send_recv_msgs(*nodelist, &msg, 0, false))) { + if (list_count(ret_list) > 1) + redo_nodelist = true; - if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0, false))) { while ((ret_data_info = list_pop(ret_list))) { temp_rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); - if (temp_rc) + if (temp_rc != SLURM_SUCCESS) { rc = temp_rc; + if (redo_nodelist) { + if (!hl) + hl = hostlist_create( + ret_data_info-> + node_name); + else + hostlist_push_host( + hl, ret_data_info-> + node_name); + } + } + destroy_data_info(ret_data_info); } } else { error("slurm_forward_data: no list was returned"); rc = SLURM_ERROR; } - slurm_free_msg(msg); + if (hl) { + xfree(*nodelist); + hostlist_sort(hl); + *nodelist = hostlist_ranged_string_xmalloc(hl); + hostlist_destroy(hl); + } + + FREE_NULL_LIST(ret_list); + return rc; } diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 1d97204c1e2823f7b08e119a322c6685143091b3..4a906f3d1b6faefd4139fe5563fb7f3fd40f51a5 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -1290,14 +1290,15 @@ extern int slurm_job_step_create ( /* Should this be in <slurm/slurm.h> ? */ /* * slurm_forward_data - forward arbitrary data to unix domain sockets on nodes - * IN nodelist: nodes to forward data to + * IN/OUT nodelist: Nodes to forward data to (if failure this list is changed to + * reflect the failed nodes). * IN address: address of unix domain socket * IN len: length of data * IN data: real data * RET: error code */ -extern int slurm_forward_data(char *nodelist, char *address, uint32_t len, - char *data); +extern int slurm_forward_data( + char **nodelist, char *address, uint32_t len, const char *data); /* * slurm_setup_sockaddr - setup a sockaddr_in struct to be used for diff --git a/src/plugins/mpi/pmi2/ring.c b/src/plugins/mpi/pmi2/ring.c index 7a3118cd39dfe8676e3dd4e70276dd843ee1cea6..dd6a5dfc06bab8bcf93252713eb28f51bcd61de6 100644 --- a/src/plugins/mpi/pmi2/ring.c +++ b/src/plugins/mpi/pmi2/ring.c @@ -219,7 +219,7 @@ static int pmix_stepd_send(const char* buf, uint32_t size, int rank) int retries = 0; while (1) { /* attempt to send message */ - rc = slurm_forward_data(host, tree_sock_addr, size, (char*) buf); + rc = slurm_forward_data(&host, tree_sock_addr, size, buf); if (rc == SLURM_SUCCESS) { /* message sent successfully, we're done */ break;