diff --git a/NEWS b/NEWS index fcb5e238f0f6904b48ca73185aadd05bb5da6728..08057a55b07565d62cfa7e679426d9e817c398e1 100644 --- a/NEWS +++ b/NEWS @@ -87,6 +87,10 @@ documents those changes that are of interest to users and admins. plugins. -- QOS support added with limits, priority and preemption (no documentation yet). + -- If a slurmd does not have a node listed in its slurm.conf (slurm.conf files + should be kept the same on all nodes) an error message is printed in the + slurmctld log along with the message already being printed in the slurmd + log for easier debugging. * Changes in SLURM 2.1.0-pre2 ============================= diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index b1e6eafc773a02440b3057691c3afdb7480e92db..532cd1e73b231704ac22e1d6047807d4bea6ce89 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -99,6 +99,7 @@ enum { SLURM_MPI_PLUGIN_NAME_INVALID, SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED, SLURM_PLUGIN_NAME_INVALID, + SLURM_UNKNOWN_FORWARD_ADDR, /* communication failures to/from slurmctld */ SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR = 1800, diff --git a/src/common/forward.c b/src/common/forward.c index d7ecdfeb690610b8241119a7bdfe530e265a876e..7ccce549d3c552eae66ff87224320042925a1915 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -102,7 +102,7 @@ void *_forward_thread(void *arg) "%s, check slurm.conf", name); slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, - SLURM_COMMUNICATIONS_CONNECTION_ERROR); + SLURM_UNKNOWN_FORWARD_ADDR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(fwd_msg->forward_mutex); @@ -114,8 +114,9 @@ void *_forward_thread(void *arg) error("forward_thread to %s: %m", name); slurm_mutex_lock(fwd_msg->forward_mutex); - mark_as_failed_forward(&fwd_msg->ret_list, name, - SLURM_COMMUNICATIONS_CONNECTION_ERROR); + mark_as_failed_forward( + &fwd_msg->ret_list, name, + SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); if (hostlist_count(hl) > 0) { 
slurm_mutex_unlock(fwd_msg->forward_mutex); @@ -321,7 +322,7 @@ void *_fwd_tree_thread(void *arg) "%s, check slurm.conf", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward(&fwd_tree->ret_list, name, - SLURM_COMMUNICATIONS_CONNECTION_ERROR); + SLURM_UNKNOWN_FORWARD_ADDR); pthread_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); @@ -361,8 +362,9 @@ void *_fwd_tree_thread(void *arg) error("fwd_tree_thread: no return list given from " "slurm_send_addr_recv_msgs", name); slurm_mutex_lock(fwd_tree->tree_mutex); - mark_as_failed_forward(&fwd_tree->ret_list, name, - SLURM_COMMUNICATIONS_CONNECTION_ERROR); + mark_as_failed_forward( + &fwd_tree->ret_list, name, + SLURM_COMMUNICATIONS_CONNECTION_ERROR); pthread_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 8cb77ddc22250c1885a46ac592999149b9a5ca2d..70cd5cc8e0c263c14e8c333c705db1cf46793a0e 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -70,7 +70,7 @@ static slurm_errtab_t slurm_errtab[] = { {-1, "Unspecified error"}, {EINPROGRESS, "Operation now in progress"}, - /*General Message error codes */ + /* General Message error codes */ { SLURM_UNEXPECTED_MSG_ERROR, "Unexpected message received" }, { SLURM_COMMUNICATIONS_CONNECTION_ERROR, @@ -95,6 +95,8 @@ static slurm_errtab_t slurm_errtab[] = { "MPI plugin's pre-launch setup failed" }, { SLURM_PLUGIN_NAME_INVALID, "Plugin initialization failed" }, + { SLURM_UNKNOWN_FORWARD_ADDR, + "Can't find an address, check slurm.conf" }, /* communication failures to/from slurmctld */ { SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR, diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index cda9a896835ffa4cb514f87a8f0bd9b7dcddba97..4b0bbc691c8cfa46c318b3227ceefbf28583409d 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -939,6 +939,13 @@ static void *_thread_per_group_rpc(void *args) 
ret_data_info->node_name); */ thread_state = DSH_DONE; break; + case SLURM_UNKNOWN_FORWARD_ADDR: + error("We were unable to forward message to '%s'. " + "Make sure the slurm.conf for each slurmd " + "contain all other nodes in your system.", + ret_data_info->node_name); + thread_state = DSH_NO_RESP; + break; case ESLURMD_EPILOG_FAILED: error("Epilog failure on host %s, " "setting DOWN",