diff --git a/NEWS b/NEWS index bbf8db8a42fb112052983d5423cb59e5f5aedd68..1624c7a9a585e679b06fd844f1d7318dcf8626e5 100644 --- a/NEWS +++ b/NEWS @@ -77,6 +77,7 @@ documents those changes that are of interest to users and admins. -- Comment out all of the logic in the job_submit/defaults plugin. The logic is only an example and not meant for actual use. -- Eliminate configuration file 4096 character line limitation. + -- More robust logic for tree message forward * Changes in SLURM 2.5.3 ======================== diff --git a/src/common/forward.c b/src/common/forward.c index 00db8ebe524e22577784d1afdb709fcb67cbcf8c..7e137226e64da32c5857eb41ac8887a7a4775c1f 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -360,11 +360,45 @@ void *_fwd_tree_thread(void *arg) xfree(send_msg.forward.nodelist); if (ret_list) { + int ret_cnt = list_count(ret_list); + /* This is most common if a slurmd is running + an older version of Slurm than the + originator of the message. + */ + if ((ret_cnt <= send_msg.forward.cnt) && + (errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR)) { + error("fwd_tree_thread: %s failed to forward " + "the message, expecting %d ret got only " + "%d", + name, send_msg.forward.cnt + 1, ret_cnt); + if (ret_cnt > 1) { /* not likely */ + ret_data_info_t *ret_data_info = NULL; + ListIterator itr = + list_iterator_create(ret_list); + while ((ret_data_info = + list_next(itr))) { + if (strcmp(ret_data_info-> + node_name, name)) + hostlist_delete_host( + fwd_tree-> + tree_hl, + ret_data_info-> + node_name); + } + list_iterator_destroy(itr); + } + } + slurm_mutex_lock(fwd_tree->tree_mutex); list_transfer(fwd_tree->ret_list, ret_list); pthread_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); list_destroy(ret_list); + /* try next node */ + if (ret_cnt <= send_msg.forward.cnt) { + free(name); + continue; + } } else { /* This should never happen (when this was * written slurm_send_addr_recv_msgs always