From d40ca77e97d5cc45be5581609ee2184ed5c54e91 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 16 Oct 2002 16:22:37 +0000
Subject: [PATCH] Require that all nodes in a partition on a computer with Elan
 have the same processor count (as used for scheduling, configured or actual).
 This will be required until the Elan interface is more sophisticated.

---
 src/slurmctld/node_mgr.c    | 13 ++++++++
 src/slurmctld/read_config.c | 59 ++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 61a4f27e76b..2d0420ecb21 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1051,12 +1051,25 @@ validate_node_specs (char *node_name, uint32_t cpus,
 	}
 	else {
 		info ("validate_node_specs: node %s has registered", node_name);
+		node_ptr->cpus = cpus;
+		node_ptr->real_memory = real_memory;
+		node_ptr->tmp_disk = tmp_disk;
+#ifdef 		HAVE_LIBELAN3
+		/* Every node in a given partition must have the same processor count at present */
+		if ((slurmctld_conf.fast_schedule == 0) &&
+		    (node_ptr->config_ptr->cpus != cpus)) {
+			error ("Node %s has processor count inconsistent with rest of partition",
+				node_name);
+			return EINVAL;		/* leave node down */
+		}
+#endif
 		resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
 		node_ptr->node_state &= (uint16_t) (~NODE_STATE_NO_RESPOND);
 		if (node_ptr->node_state == NODE_STATE_UNKNOWN)
 			node_ptr->node_state = NODE_STATE_IDLE;
 		else if ((node_ptr->node_state == NODE_STATE_DOWN) &&
 		         (slurmctld_conf.ret2service == 1)) {
+
 			if (job_count)
 				node_ptr->node_state = NODE_STATE_ALLOCATED;
 			else
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 4e1249e3508..c0688f0d628 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -48,10 +48,13 @@
 
 #define BUF_SIZE 1024
 
-static int	init_slurm_conf ();
+static int	init_slurm_conf (void);
 static int	parse_config_spec (char *in_line);
 static int	parse_node_spec (char *in_line);
 static int	parse_part_spec (char *in_line);
+#ifdef 	HAVE_LIBELAN3
+static void	validate_node_proc_count (void);
+#endif
 
 static char highest_node_name[MAX_NAME_LEN] = "";
 int node_record_count = 0;
@@ -242,8 +245,8 @@ static int
 parse_config_spec (char *in_line) 
 {
 	int error_code;
-	int fast_schedule = 0, hash_base = 0, heartbeat_interval = 0, kill_wait = 0;
-	int ret2service = 0, slurmctld_timeout = 0, slurmd_timeout = 0;
+	int fast_schedule = -1, hash_base = -1, heartbeat_interval = -1, kill_wait = -1;
+	int ret2service = -1, slurmctld_timeout = -1, slurmd_timeout = -1;
 	char *backup_controller = NULL, *control_machine = NULL, *epilog = NULL;
 	char *prioritize = NULL, *prolog = NULL, *state_save_location = NULL, *tmp_fs = NULL;
 	char *slurmctld_port = NULL, *slurmd_port = NULL;
@@ -294,19 +297,19 @@ parse_config_spec (char *in_line)
 		slurmctld_conf.epilog = epilog;
 	}
 
-	if ( fast_schedule ) 
+	if ( fast_schedule != -1) 
 		slurmctld_conf.fast_schedule = fast_schedule;
 
 	if ( first_job_id ) 
 		slurmctld_conf.first_job_id = first_job_id;
 
-	if ( hash_base ) 
+	if ( hash_base != -1) 
 		slurmctld_conf.hash_base = hash_base;
 
-	if ( heartbeat_interval ) 
+	if ( heartbeat_interval != -1) 
 		slurmctld_conf.heartbeat_interval = heartbeat_interval;
 
-	if ( kill_wait ) 
+	if ( kill_wait != -1) 
 		slurmctld_conf.kill_wait = kill_wait;
 
 	if ( prioritize ) {
@@ -321,7 +324,7 @@ parse_config_spec (char *in_line)
 		slurmctld_conf.prolog = prolog;
 	}
 
-	if ( ret2service ) 
+	if ( ret2service != -1) 
 		slurmctld_conf.ret2service = ret2service;
 
 	if ( slurmctld_port ) {
@@ -333,7 +336,7 @@ parse_config_spec (char *in_line)
 		endservent ();
 	}
 
-	if ( slurmctld_timeout ) 
+	if ( slurmctld_timeout != -1) 
 		slurmctld_conf.slurmctld_timeout = slurmctld_timeout;
 
 	if ( slurmd_port ) {
@@ -345,7 +348,7 @@ parse_config_spec (char *in_line)
 		endservent ();
 	}
 
-	if ( slurmd_timeout ) 
+	if ( slurmd_timeout != -1) 
 		slurmctld_conf.slurmd_timeout = slurmd_timeout;
 
 	if ( state_save_location ) {
@@ -866,6 +869,9 @@ read_slurm_conf (int recover) {
 
 	if ((error_code = build_bitmaps ()))
 		return error_code;
+#ifdef 	HAVE_LIBELAN3
+	validate_node_proc_count ();
+#endif
 	if (recover) {
 		(void) sync_nodes_to_jobs ();
 	}
@@ -923,3 +929,36 @@ sync_nodes_to_jobs (void)
 		info ("sync_nodes_to_jobs updated state of %d nodes", update_cnt);
 	return update_cnt;
 }
+
+#ifdef 	HAVE_LIBELAN3
+/* Every node in a given partition must have the same processor count at present */
+void validate_node_proc_count (void)
+{
+	ListIterator part_record_iterator;
+	struct part_record *part_record_point;
+	int first_bit, last_bit, i, node_size, part_size;
+
+	part_record_iterator = list_iterator_create (part_list);		
+	while ((part_record_point = (struct part_record *) list_next (part_record_iterator))) {
+		first_bit = bit_ffs (part_record_point->node_bitmap);
+		last_bit  = bit_fls (part_record_point->node_bitmap);
+		part_size = -1;
+		for (i=first_bit; i<=last_bit; i++) {
+			if (bit_test (part_record_point->node_bitmap, i) == 0)
+				continue;
+
+			if (slurmctld_conf.fast_schedule)
+				node_size = node_record_table_ptr[i].config_ptr->cpus;
+			else 
+				node_size = node_record_table_ptr[i].cpus;
+
+			if (part_size == -1)
+				part_size = node_size;
+			else if (part_size != node_size)
+				fatal ("Partition %s has inconsistent processor count",
+					part_record_point->name);
+		}
+	}			
+	list_iterator_destroy (part_record_iterator);
+}
+#endif
-- 
GitLab