diff --git a/NEWS b/NEWS index 276e2f498231932600255cb2ffa34545bb9fb3e5..f2ae840e4ed0a07592d0ea734234884e08a17bdc 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,10 @@ documents those changes that are of interest to users and admins. -- In select/cons_res - fix for function argument type mis-match in getting CPU count for a job, from Ernest Artiaga, BSC. -- In sched/wiki2 - Report job's tasks_per_node requirement. + -- In forward logic fix to check if the forwarding node recieves a connection + but doesn't ever get the message from the sender (network issue or + something) also check to make sure if we get something back we make sure + we account for everything we sent out before we call it good. * Changes in SLURM 1.2.3 ======================== diff --git a/src/common/forward.c b/src/common/forward.c index 6b82c0479f414fba12c06d0721327f70780b4451..58eee32d3cb239c792ccda8a883813788f681590 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -109,7 +109,9 @@ void *_forward_thread(void *arg) xfree(fwd_msg->header.forward.nodelist); fwd_msg->header.forward.nodelist = xstrdup(buf); fwd_msg->header.forward.cnt = hostlist_count(hl); - + /* info("sending %d forwards (%s) to %s", */ +/* fwd_msg->header.forward.cnt, */ +/* fwd_msg->header.forward.nodelist, name); */ if (fwd_msg->header.forward.nodelist[0]) { debug3("forward: send to %s along with %s", name, fwd_msg->header.forward.nodelist); @@ -142,7 +144,7 @@ void *_forward_thread(void *arg) mark_as_failed_forward(&fwd_msg->ret_list, name, errno); free(name); - if (hostlist_count(hl) > 0) { + if(hostlist_count(hl) > 0) { free_buf(buffer); buffer = init_buf(0); slurm_mutex_unlock(fwd_msg->forward_mutex); @@ -169,7 +171,7 @@ void *_forward_thread(void *arg) goto cleanup; } - if(fwd_msg->header.forward.cnt>0) { + if(fwd_msg->header.forward.cnt > 0) { steps = (fwd_msg->header.forward.cnt+1) / slurm_get_tree_width(); fwd_msg->timeout = (FORWARD_EXTRA_STEP_WAIT_MS*steps); @@ -178,9 +180,11 @@ void *_forward_thread(void *arg) } ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout); - + /* info("sent %d forwards got %d back", */ +/* fwd_msg->header.forward.cnt, list_count(ret_list)); */ + if(!ret_list || (fwd_msg->header.forward.cnt != 0 - && list_count(ret_list) == 0)) { + && list_count(ret_list) <= 1)) { slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, errno); @@ -194,6 +198,52 @@ void *_forward_thread(void *arg) continue; } goto cleanup; + } else if((fwd_msg->header.forward.cnt+1) + != list_count(ret_list)) { + /* this should never be called since the above + should catch the failed forwards and pipe + them back down, but this is here so we + never have to worry about a locked + mutex */ + ListIterator itr = NULL; + char *tmp = NULL; + int first_node_found = 0; + hostlist_iterator_t host_itr + = hostlist_iterator_create(hl); + error("We shouldn't be here. We forwarded to %d " + "but only got %d back", + (fwd_msg->header.forward.cnt+1), + list_count(ret_list)); + while((tmp = hostlist_next(host_itr))) { + int node_found = 0; + itr = list_iterator_create(ret_list); + while((ret_data_info = list_next(itr))) { + if(!ret_data_info->node_name) { + first_node_found = 1; + ret_data_info->node_name = + xstrdup(name); + } + if(!strcmp(tmp, + ret_data_info->node_name)) { + node_found = 1; + break; + } + } + list_iterator_destroy(itr); + if(!node_found) { + mark_as_failed_forward( + &fwd_msg->ret_list, + tmp, + SLURM_ERROR); + } + free(tmp); + } + hostlist_iterator_destroy(host_itr); + if(!first_node_found) { + mark_as_failed_forward(&fwd_msg->ret_list, + name, + SLURM_ERROR); + } } break; } diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index fdc2b7120ed96da964142ff13d975f526c27e266..d1fdc19f2254d546772a0225c3033e8adf6495db 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -933,7 +933,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) slurm_msg_t_init(&msg); msg.conn_fd = fd; - if (timeout <= 0) { + if(timeout <= 0) { /* convert secs to msec */ timeout = slurm_get_msg_timeout() * 1000; orig_timeout = timeout; @@ -964,7 +964,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) * length and allocate space on the heap for a buffer containing * the message. */ - if (_slurm_msg_recvfrom_timeout(fd, &buf, &buflen, 0, timeout) < 0) { + if(_slurm_msg_recvfrom_timeout(fd, &buf, &buflen, 0, timeout) < 0) { forward_init(&header.forward, NULL); rc = errno; goto total_return; @@ -981,7 +981,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) goto total_return; } - if (check_header_version(&header) < 0) { + if(check_header_version(&header) < 0) { free_buf(buffer); rc = SLURM_PROTOCOL_VERSION_ERROR; goto total_return; @@ -1002,7 +1002,7 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) "slurm_receive_and_forward_msgs instead"); } - if ((auth_cred = g_slurm_auth_unpack(buffer)) == NULL) { + if((auth_cred = g_slurm_auth_unpack(buffer)) == NULL) { error( "authentication: %s ", g_slurm_auth_errstr(g_slurm_auth_errno(NULL))); free_buf(buffer); @@ -1011,9 +1011,9 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) } rc = g_slurm_auth_verify( auth_cred, NULL, 2 ); - if (rc != SLURM_SUCCESS) { - error( "authentication: %s ", - g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred))); + if(rc != SLURM_SUCCESS) { + error("authentication: %s ", + g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred))); (void) g_slurm_auth_destroy(auth_cred); free_buf(buffer); rc = SLURM_PROTOCOL_AUTHENTICATION_ERROR; @@ -1726,7 +1726,7 @@ _send_and_recv_msg(slurm_fd fd, slurm_msg_t *req, /* no need to adjust and timeouts here since we are not forwarding or expecting anything other than 1 message and the regular timeout will be altered in - slurm_recieve_msg if it is 0 */ + slurm_receive_msg if it is 0 */ rc = slurm_receive_msg(fd, resp, timeout); }