From 10661fd60f12b3e93e762a6e15b186aadc142b11 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Tue, 12 Apr 2016 15:35:05 -0700 Subject: [PATCH] Rewrite of slurm_forward_data() to not leak memory and return list of nodes not able to contact. --- src/common/slurm_protocol_api.c | 53 ++++++++++++++++++++++++--------- src/common/slurm_protocol_api.h | 7 +++-- src/plugins/mpi/pmi2/ring.c | 2 +- 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index b7ba19adbe7..2276fe3d41b 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -4639,45 +4639,70 @@ static void _print_data(char *data, int len) /* * slurm_forward_data - forward arbitrary data to unix domain sockets on nodes - * IN nodelist: nodes to forward data to + * IN/OUT nodelist: Nodes to forward data to (if failure this list is changed to + * reflect the failed nodes). * IN address: address of unix domain socket * IN len: length of data * IN data: real data * RET: error code */ -extern int -slurm_forward_data(char *nodelist, char *address, uint32_t len, char *data) +extern int slurm_forward_data( + char **nodelist, char *address, uint32_t len, const char *data) { List ret_list = NULL; int temp_rc = 0, rc = 0; ret_data_info_t *ret_data_info = NULL; - slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t)); + slurm_msg_t msg; forward_data_msg_t req; + hostlist_t hl = NULL; + bool redo_nodelist = false; + slurm_msg_t_init(&msg); - slurm_msg_t_init(msg); - - debug("slurm_forward_data: nodelist=%s, address=%s, len=%u", - nodelist, address, len); + debug2("slurm_forward_data: nodelist=%s, address=%s, len=%u", + *nodelist, address, len); req.address = address; req.len = len; - req.data = data; + req.data = (char *)data; + + msg.msg_type = REQUEST_FORWARD_DATA; + msg.data = &req; - msg->msg_type = REQUEST_FORWARD_DATA; - msg->data = &req; + if ((ret_list = slurm_send_recv_msgs(*nodelist, &msg, 0, false))) { + if (list_count(ret_list) > 1) + redo_nodelist = true; - if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0, false))) { while ((ret_data_info = list_pop(ret_list))) { temp_rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); - if (temp_rc) + if (temp_rc != SLURM_SUCCESS) { rc = temp_rc; + if (redo_nodelist) { + if (!hl) + hl = hostlist_create( + ret_data_info-> + node_name); + else + hostlist_push_host( + hl, ret_data_info-> + node_name); + } + } + destroy_data_info(ret_data_info); } } else { error("slurm_forward_data: no list was returned"); rc = SLURM_ERROR; } - slurm_free_msg(msg); + if (hl) { + xfree(*nodelist); + hostlist_sort(hl); + *nodelist = hostlist_ranged_string_xmalloc(hl); + hostlist_destroy(hl); + } + + FREE_NULL_LIST(ret_list); + return rc; } diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 1d97204c1e2..4a906f3d1b6 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -1290,14 +1290,15 @@ extern int slurm_job_step_create ( /* Should this be in <slurm/slurm.h> ? */ /* * slurm_forward_data - forward arbitrary data to unix domain sockets on nodes - * IN nodelist: nodes to forward data to + * IN/OUT nodelist: Nodes to forward data to (if failure this list is changed to + * reflect the failed nodes). * IN address: address of unix domain socket * IN len: length of data * IN data: real data * RET: error code */ -extern int slurm_forward_data(char *nodelist, char *address, uint32_t len, - char *data); +extern int slurm_forward_data( + char **nodelist, char *address, uint32_t len, const char *data); /* * slurm_setup_sockaddr - setup a sockaddr_in struct to be used for diff --git a/src/plugins/mpi/pmi2/ring.c b/src/plugins/mpi/pmi2/ring.c index 7a3118cd39d..dd6a5dfc06b 100644 --- a/src/plugins/mpi/pmi2/ring.c +++ b/src/plugins/mpi/pmi2/ring.c @@ -219,7 +219,7 @@ static int pmix_stepd_send(const char* buf, uint32_t size, int rank) int retries = 0; while (1) { /* attempt to send message */ - rc = slurm_forward_data(host, tree_sock_addr, size, (char*) buf); + rc = slurm_forward_data(&host, tree_sock_addr, size, buf); if (rc == SLURM_SUCCESS) { /* message sent successfully, we're done */ break; -- GitLab