Skip to content
Snippets Groups Projects
Commit 4ad852dc authored by Danny Auble's avatar Danny Auble
Browse files

fix for forward logic to not handle a failed send correctly.

parent 54727cc3
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,10 @@ documents those changes that are of interest to users and admins.
-- In select/cons_res - fix for function argument type mis-match in getting
CPU count for a job, from Ernest Artiaga, BSC.
-- In sched/wiki2 - Report job's tasks_per_node requirement.
-- In forward logic fix to check if the forwarding node recieves a connection
but doesn't ever get the message from the sender (network issue or
something) also check to make sure if we get something back we make sure
we account for everything we sent out before we call it good.
* Changes in SLURM 1.2.3
========================
......
......@@ -109,7 +109,9 @@ void *_forward_thread(void *arg)
xfree(fwd_msg->header.forward.nodelist);
fwd_msg->header.forward.nodelist = xstrdup(buf);
fwd_msg->header.forward.cnt = hostlist_count(hl);
/* info("sending %d forwards (%s) to %s", */
/* fwd_msg->header.forward.cnt, */
/* fwd_msg->header.forward.nodelist, name); */
if (fwd_msg->header.forward.nodelist[0]) {
debug3("forward: send to %s along with %s",
name, fwd_msg->header.forward.nodelist);
......@@ -142,7 +144,7 @@ void *_forward_thread(void *arg)
mark_as_failed_forward(&fwd_msg->ret_list, name,
errno);
free(name);
if (hostlist_count(hl) > 0) {
if(hostlist_count(hl) > 0) {
free_buf(buffer);
buffer = init_buf(0);
slurm_mutex_unlock(fwd_msg->forward_mutex);
......@@ -169,7 +171,7 @@ void *_forward_thread(void *arg)
goto cleanup;
}
if(fwd_msg->header.forward.cnt>0) {
if(fwd_msg->header.forward.cnt > 0) {
steps = (fwd_msg->header.forward.cnt+1) /
slurm_get_tree_width();
fwd_msg->timeout = (FORWARD_EXTRA_STEP_WAIT_MS*steps);
......@@ -178,9 +180,11 @@ void *_forward_thread(void *arg)
}
ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
/* info("sent %d forwards got %d back", */
/* fwd_msg->header.forward.cnt, list_count(ret_list)); */
if(!ret_list || (fwd_msg->header.forward.cnt != 0
&& list_count(ret_list) == 0)) {
&& list_count(ret_list) <= 1)) {
slurm_mutex_lock(fwd_msg->forward_mutex);
mark_as_failed_forward(&fwd_msg->ret_list, name,
errno);
......@@ -194,6 +198,52 @@ void *_forward_thread(void *arg)
continue;
}
goto cleanup;
} else if((fwd_msg->header.forward.cnt+1)
!= list_count(ret_list)) {
/* this should never be called since the above
should catch the failed forwards and pipe
them back down, but this is here so we
never have to worry about a locked
mutex */
ListIterator itr = NULL;
char *tmp = NULL;
int first_node_found = 0;
hostlist_iterator_t host_itr
= hostlist_iterator_create(hl);
error("We shouldn't be here. We forwarded to %d "
"but only got %d back",
(fwd_msg->header.forward.cnt+1),
list_count(ret_list));
while((tmp = hostlist_next(host_itr))) {
int node_found = 0;
itr = list_iterator_create(ret_list);
while((ret_data_info = list_next(itr))) {
if(!ret_data_info->node_name) {
first_node_found = 1;
ret_data_info->node_name =
xstrdup(name);
}
if(!strcmp(tmp,
ret_data_info->node_name)) {
node_found = 1;
break;
}
}
list_iterator_destroy(itr);
if(!node_found) {
mark_as_failed_forward(
&fwd_msg->ret_list,
tmp,
SLURM_ERROR);
}
free(tmp);
}
hostlist_iterator_destroy(host_itr);
if(!first_node_found) {
mark_as_failed_forward(&fwd_msg->ret_list,
name,
SLURM_ERROR);
}
}
break;
}
......
......@@ -933,7 +933,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
slurm_msg_t_init(&msg);
msg.conn_fd = fd;
if (timeout <= 0) {
if(timeout <= 0) {
/* convert secs to msec */
timeout = slurm_get_msg_timeout() * 1000;
orig_timeout = timeout;
......@@ -964,7 +964,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
* length and allocate space on the heap for a buffer containing
* the message.
*/
if (_slurm_msg_recvfrom_timeout(fd, &buf, &buflen, 0, timeout) < 0) {
if(_slurm_msg_recvfrom_timeout(fd, &buf, &buflen, 0, timeout) < 0) {
forward_init(&header.forward, NULL);
rc = errno;
goto total_return;
......@@ -981,7 +981,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
goto total_return;
}
if (check_header_version(&header) < 0) {
if(check_header_version(&header) < 0) {
free_buf(buffer);
rc = SLURM_PROTOCOL_VERSION_ERROR;
goto total_return;
......@@ -1002,7 +1002,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
"slurm_receive_and_forward_msgs instead");
}
if ((auth_cred = g_slurm_auth_unpack(buffer)) == NULL) {
if((auth_cred = g_slurm_auth_unpack(buffer)) == NULL) {
error( "authentication: %s ",
g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
free_buf(buffer);
......@@ -1011,9 +1011,9 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
}
rc = g_slurm_auth_verify( auth_cred, NULL, 2 );
if (rc != SLURM_SUCCESS) {
error( "authentication: %s ",
g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
if(rc != SLURM_SUCCESS) {
error("authentication: %s ",
g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
(void) g_slurm_auth_destroy(auth_cred);
free_buf(buffer);
rc = SLURM_PROTOCOL_AUTHENTICATION_ERROR;
......@@ -1726,7 +1726,7 @@ _send_and_recv_msg(slurm_fd fd, slurm_msg_t *req,
/* no need to adjust and timeouts here since we are not
forwarding or expecting anything other than 1 message
and the regular timeout will be altered in
slurm_recieve_msg if it is 0 */
slurm_receive_msg if it is 0 */
rc = slurm_receive_msg(fd, resp, timeout);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment