Skip to content
Snippets Groups Projects
Commit 10661fd6 authored by Danny Auble's avatar Danny Auble
Browse files

Rewrite slurm_forward_data() so it no longer leaks memory and returns the list of

nodes that could not be contacted.
parent 99b07b83
No related branches found
No related tags found
No related merge requests found
...@@ -4639,45 +4639,70 @@ static void _print_data(char *data, int len) ...@@ -4639,45 +4639,70 @@ static void _print_data(char *data, int len)
/* /*
* slurm_forward_data - forward arbitrary data to unix domain sockets on nodes * slurm_forward_data - forward arbitrary data to unix domain sockets on nodes
* IN nodelist: nodes to forward data to * IN/OUT nodelist: Nodes to forward data to. On failure, when more than one
* result was returned, the string is freed and replaced with a sorted, ranged list of the failed nodes.
* IN address: address of unix domain socket * IN address: address of unix domain socket
* IN len: length of data * IN len: length of data
* IN data: real data * IN data: real data
* RET: error code * RET: error code
*/ */
extern int extern int slurm_forward_data(
slurm_forward_data(char *nodelist, char *address, uint32_t len, char *data) char **nodelist, char *address, uint32_t len, const char *data)
{ {
List ret_list = NULL; List ret_list = NULL;
int temp_rc = 0, rc = 0; int temp_rc = 0, rc = 0;
ret_data_info_t *ret_data_info = NULL; ret_data_info_t *ret_data_info = NULL;
slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t)); slurm_msg_t msg;
forward_data_msg_t req; forward_data_msg_t req;
hostlist_t hl = NULL;
bool redo_nodelist = false;
slurm_msg_t_init(&msg);
slurm_msg_t_init(msg); debug2("slurm_forward_data: nodelist=%s, address=%s, len=%u",
*nodelist, address, len);
debug("slurm_forward_data: nodelist=%s, address=%s, len=%u",
nodelist, address, len);
req.address = address; req.address = address;
req.len = len; req.len = len;
req.data = data; req.data = (char *)data;
msg.msg_type = REQUEST_FORWARD_DATA;
msg.data = &req;
msg->msg_type = REQUEST_FORWARD_DATA; if ((ret_list = slurm_send_recv_msgs(*nodelist, &msg, 0, false))) {
msg->data = &req; if (list_count(ret_list) > 1)
redo_nodelist = true;
if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0, false))) {
while ((ret_data_info = list_pop(ret_list))) { while ((ret_data_info = list_pop(ret_list))) {
temp_rc = slurm_get_return_code(ret_data_info->type, temp_rc = slurm_get_return_code(ret_data_info->type,
ret_data_info->data); ret_data_info->data);
if (temp_rc) if (temp_rc != SLURM_SUCCESS) {
rc = temp_rc; rc = temp_rc;
if (redo_nodelist) {
if (!hl)
hl = hostlist_create(
ret_data_info->
node_name);
else
hostlist_push_host(
hl, ret_data_info->
node_name);
}
}
destroy_data_info(ret_data_info);
} }
} else { } else {
error("slurm_forward_data: no list was returned"); error("slurm_forward_data: no list was returned");
rc = SLURM_ERROR; rc = SLURM_ERROR;
} }
slurm_free_msg(msg); if (hl) {
xfree(*nodelist);
hostlist_sort(hl);
*nodelist = hostlist_ranged_string_xmalloc(hl);
hostlist_destroy(hl);
}
FREE_NULL_LIST(ret_list);
return rc; return rc;
} }
......
...@@ -1290,14 +1290,15 @@ extern int slurm_job_step_create ( ...@@ -1290,14 +1290,15 @@ extern int slurm_job_step_create (
/* Should this be in <slurm/slurm.h> ? */ /* Should this be in <slurm/slurm.h> ? */
/* /*
* slurm_forward_data - forward arbitrary data to unix domain sockets on nodes * slurm_forward_data - forward arbitrary data to unix domain sockets on nodes
* IN nodelist: nodes to forward data to * IN/OUT nodelist: Nodes to forward data to. On failure, when more than one
* result was returned, the string is freed and replaced with a sorted, ranged list of the failed nodes.
* IN address: address of unix domain socket * IN address: address of unix domain socket
* IN len: length of data * IN len: length of data
* IN data: real data * IN data: real data
* RET: error code * RET: error code
*/ */
extern int slurm_forward_data(char *nodelist, char *address, uint32_t len, extern int slurm_forward_data(
char *data); char **nodelist, char *address, uint32_t len, const char *data);
/* /*
* slurm_setup_sockaddr - setup a sockaddr_in struct to be used for * slurm_setup_sockaddr - setup a sockaddr_in struct to be used for
......
...@@ -219,7 +219,7 @@ static int pmix_stepd_send(const char* buf, uint32_t size, int rank) ...@@ -219,7 +219,7 @@ static int pmix_stepd_send(const char* buf, uint32_t size, int rank)
int retries = 0; int retries = 0;
while (1) { while (1) {
/* attempt to send message */ /* attempt to send message */
rc = slurm_forward_data(host, tree_sock_addr, size, (char*) buf); rc = slurm_forward_data(&host, tree_sock_addr, size, buf);
if (rc == SLURM_SUCCESS) { if (rc == SLURM_SUCCESS) {
/* message sent successfully, we're done */ /* message sent successfully, we're done */
break; break;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment