From 9a6148430269e191ce9b648fca101c98dbcd65cc Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Thu, 10 Sep 2009 18:04:48 +0000
Subject: [PATCH] If a slurmd does not have a node listed in its slurm.conf
 (slurm.conf files should be kept the same on all nodes) an error message is
 printed in the slurmctld log along with the message already being printed in
 the slurmd log for easier debugging.

---
 NEWS                     |  4 ++++
 slurm/slurm_errno.h      |  1 +
 src/common/forward.c     | 14 ++++++++------
 src/common/slurm_errno.c |  4 +++-
 src/slurmctld/agent.c    |  7 +++++++
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index fcb5e238f0f..08057a55b07 100644
--- a/NEWS
+++ b/NEWS
@@ -87,6 +87,10 @@ documents those changes that are of interest to users and admins.
     plugins.
  -- QOS support added with limits, priority and preemption
     (no documentation yet).
+ -- If a slurmd does not have a node listed in its slurm.conf (slurm.conf
+    files should be kept the same on all nodes) an error message is printed
+    in the slurmctld log along with the message already being printed in the
+    slurmd log for easier debugging.
 
 * Changes in SLURM 2.1.0-pre2
 =============================
diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index b1e6eafc773..532cd1e73b2 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -99,6 +99,7 @@ enum {
 	SLURM_MPI_PLUGIN_NAME_INVALID,
 	SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED,
 	SLURM_PLUGIN_NAME_INVALID,
+	SLURM_UNKNOWN_FORWARD_ADDR,
 
 	/* communication failures to/from slurmctld */
 	SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR =     1800,
diff --git a/src/common/forward.c b/src/common/forward.c
index d7ecdfeb690..7ccce549d3c 100644
--- a/src/common/forward.c
+++ b/src/common/forward.c
@@ -102,7 +102,7 @@ void *_forward_thread(void *arg)
 			      "%s, check slurm.conf", name);
 			slurm_mutex_lock(fwd_msg->forward_mutex);
 			mark_as_failed_forward(&fwd_msg->ret_list, name,
-					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
+					       SLURM_UNKNOWN_FORWARD_ADDR);
  			free(name);
 			if (hostlist_count(hl) > 0) {
 				slurm_mutex_unlock(fwd_msg->forward_mutex);
@@ -114,8 +114,9 @@ void *_forward_thread(void *arg)
 			error("forward_thread to %s: %m", name);
 
 			slurm_mutex_lock(fwd_msg->forward_mutex);
-			mark_as_failed_forward(&fwd_msg->ret_list, name,
-					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
+			mark_as_failed_forward(
+				&fwd_msg->ret_list, name,
+				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
 			free(name);
 			if (hostlist_count(hl) > 0) {
 				slurm_mutex_unlock(fwd_msg->forward_mutex);
@@ -321,7 +322,7 @@ void *_fwd_tree_thread(void *arg)
 			      "%s, check slurm.conf", name);
 			slurm_mutex_lock(fwd_tree->tree_mutex);
 			mark_as_failed_forward(&fwd_tree->ret_list, name,
-					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
+					       SLURM_UNKNOWN_FORWARD_ADDR);
  			pthread_cond_signal(fwd_tree->notify);
 			slurm_mutex_unlock(fwd_tree->tree_mutex);
 			free(name);
@@ -361,8 +362,9 @@ void *_fwd_tree_thread(void *arg)
 			error("fwd_tree_thread: no return list given from "
 			      "slurm_send_addr_recv_msgs", name);
 			slurm_mutex_lock(fwd_tree->tree_mutex);
-			mark_as_failed_forward(&fwd_tree->ret_list, name,
-					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
+			mark_as_failed_forward(
+				&fwd_tree->ret_list, name,
+				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
  			pthread_cond_signal(fwd_tree->notify);
 			slurm_mutex_unlock(fwd_tree->tree_mutex);
 			free(name);
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 8cb77ddc222..70cd5cc8e0c 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -70,7 +70,7 @@ static slurm_errtab_t slurm_errtab[] = {
 	{-1, "Unspecified error"},
 	{EINPROGRESS, "Operation now in progress"},
 
-	/*General Message error codes */
+	/* General Message error codes */
 	{ SLURM_UNEXPECTED_MSG_ERROR, 
 	  "Unexpected message received" 			},
 	{ SLURM_COMMUNICATIONS_CONNECTION_ERROR,
@@ -95,6 +95,8 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "MPI plugin's pre-launch setup failed"                },
 	{ SLURM_PLUGIN_NAME_INVALID,
 	  "Plugin initialization failed"			},
+	{ SLURM_UNKNOWN_FORWARD_ADDR,
+	  "Can't find an address, check slurm.conf"		},
 
 	/* communication failures to/from slurmctld */
 	{ SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR,
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index cda9a896835..4b0bbc691c8 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -939,6 +939,13 @@ static void *_thread_per_group_rpc(void *args)
 			  ret_data_info->node_name); */
 			thread_state = DSH_DONE;
 			break;
+		case SLURM_UNKNOWN_FORWARD_ADDR:
+			error("We were unable to forward message to '%s'.  "
+			      "Make sure the slurm.conf for each slurmd "
+			      "contain all other nodes in your system.",
+			      ret_data_info->node_name);
+			thread_state = DSH_NO_RESP;
+			break;
 		case ESLURMD_EPILOG_FAILED:
 			error("Epilog failure on host %s, "
 			      "setting DOWN", 
-- 
GitLab