From d40ca77e97d5cc45be5581609ee2184ed5c54e91 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 16 Oct 2002 16:22:37 +0000
Subject: [PATCH] Require that all nodes in a partition on a computer with
 Elan have the same processor count (as used for scheduling, configured or
 actual). This will be required until the Elan interface is more
 sophisticated.

---
 src/slurmctld/node_mgr.c    | 13 ++++++++
 src/slurmctld/read_config.c | 59 ++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 61a4f27e76b..2d0420ecb21 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1051,12 +1051,25 @@ validate_node_specs (char *node_name, uint32_t cpus,
 	} else {
 		info ("validate_node_specs: node %s has registered",
 			node_name);
+		node_ptr->cpus = cpus;
+		node_ptr->real_memory = real_memory;
+		node_ptr->tmp_disk = tmp_disk;
+#ifdef HAVE_LIBELAN3
+		/* Every node in a given partition must have the same processor count at present */
+		if ((slurmctld_conf.fast_schedule == 0) &&
+		    (node_ptr->config_ptr->cpus != cpus)) {
+			error ("Node %s has processor count inconsistent with rest of partition",
+				node_name);
+			return EINVAL;	/* leave node down */
+		}
+#endif
 		resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
 		node_ptr->node_state &= (uint16_t) (~NODE_STATE_NO_RESPOND);
 		if (node_ptr->node_state == NODE_STATE_UNKNOWN)
 			node_ptr->node_state = NODE_STATE_IDLE;
 		else if ((node_ptr->node_state == NODE_STATE_DOWN) &&
 			 (slurmctld_conf.ret2service == 1)) {
+
 			if (job_count)
 				node_ptr->node_state = NODE_STATE_ALLOCATED;
 			else
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 4e1249e3508..c0688f0d628 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -48,10 +48,13 @@
 
 #define BUF_SIZE 1024
 
-static int init_slurm_conf ();
+static int init_slurm_conf (void);
 static int parse_config_spec (char *in_line);
 static int parse_node_spec (char *in_line);
 static int parse_part_spec (char *in_line);
+#ifdef HAVE_LIBELAN3
+static void validate_node_proc_count (void);
+#endif
 
 static char highest_node_name[MAX_NAME_LEN] = "";
 int node_record_count = 0;
@@ -242,8 +245,8 @@ static int
 parse_config_spec (char *in_line)
 {
 	int error_code;
-	int fast_schedule = 0, hash_base = 0, heartbeat_interval = 0, kill_wait = 0;
-	int ret2service = 0, slurmctld_timeout = 0, slurmd_timeout = 0;
+	int fast_schedule = -1, hash_base = -1, heartbeat_interval = -1, kill_wait = -1;
+	int ret2service = -1, slurmctld_timeout = -1, slurmd_timeout = -1;
 	char *backup_controller = NULL, *control_machine = NULL, *epilog = NULL;
 	char *prioritize = NULL, *prolog = NULL, *state_save_location = NULL, *tmp_fs = NULL;
 	char *slurmctld_port = NULL, *slurmd_port = NULL;
@@ -294,19 +297,19 @@ parse_config_spec (char *in_line)
 		slurmctld_conf.epilog = epilog;
 	}
 
-	if ( fast_schedule )
+	if ( fast_schedule != -1)
 		slurmctld_conf.fast_schedule = fast_schedule;
 
 	if ( first_job_id )
 		slurmctld_conf.first_job_id = first_job_id;
 
-	if ( hash_base )
+	if ( hash_base != -1)
 		slurmctld_conf.hash_base = hash_base;
 
-	if ( heartbeat_interval )
+	if ( heartbeat_interval != -1)
 		slurmctld_conf.heartbeat_interval = heartbeat_interval;
 
-	if ( kill_wait )
+	if ( kill_wait != -1)
 		slurmctld_conf.kill_wait = kill_wait;
 
 	if ( prioritize ) {
@@ -321,7 +324,7 @@ parse_config_spec (char *in_line)
 		slurmctld_conf.prolog = prolog;
 	}
 
-	if ( ret2service )
+	if ( ret2service != -1)
 		slurmctld_conf.ret2service = ret2service;
 
 	if ( slurmctld_port ) {
@@ -333,7 +336,7 @@ parse_config_spec (char *in_line)
 		endservent ();
 	}
 
-	if ( slurmctld_timeout )
+	if ( slurmctld_timeout != -1)
 		slurmctld_conf.slurmctld_timeout = slurmctld_timeout;
 
 	if ( slurmd_port ) {
@@ -345,7 +348,7 @@ parse_config_spec (char *in_line)
 		endservent ();
 	}
 
-	if ( slurmd_timeout )
+	if ( slurmd_timeout != -1)
 		slurmctld_conf.slurmd_timeout = slurmd_timeout;
 
 	if ( state_save_location ) {
@@ -866,6 +869,9 @@ read_slurm_conf (int recover) {
 	if ((error_code = build_bitmaps ()))
 		return error_code;
 
+#ifdef HAVE_LIBELAN3
+	validate_node_proc_count ();
+#endif
 	if (recover) {
 		(void) sync_nodes_to_jobs ();
 	}
@@ -923,3 +929,36 @@ sync_nodes_to_jobs (void)
 	info ("sync_nodes_to_jobs updated state of %d nodes", update_cnt);
 	return update_cnt;
 }
+
+#ifdef HAVE_LIBELAN3
+/* Every node in a given partition must have the same processor count at present */
+void validate_node_proc_count (void)
+{
+	ListIterator part_record_iterator;
+	struct part_record *part_record_point;
+	int first_bit, last_bit, i, node_size, part_size;
+
+	part_record_iterator = list_iterator_create (part_list);
+	while ((part_record_point = (struct part_record *) list_next (part_record_iterator))) {
+		first_bit = bit_ffs (part_record_point->node_bitmap);
+		last_bit = bit_fls (part_record_point->node_bitmap);
+		part_size = -1;
+		for (i=first_bit; i<=last_bit; i++) {
+			if (bit_test (part_record_point->node_bitmap, i) == 0)
+				continue;
+
+			if (slurmctld_conf.fast_schedule)
+				node_size = node_record_table_ptr[i].config_ptr->cpus;
+			else
+				node_size = node_record_table_ptr[i].cpus;
+
+			if (part_size == -1)
+				part_size = node_size;
+			else if (part_size != node_size)
+				fatal ("Partition %s has inconsistent processor count",
+					part_record_point->name);
+		}
+	}
+	list_iterator_destroy (part_record_iterator);
+}
+#endif
--
GitLab
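
Editor's note: for readers outside the SLURM tree, the following minimal,
self-contained C sketch shows the same uniform-processor-count rule that
validate_node_proc_count() enforces, with a plain array standing in for
slurmctld's partition list and node bitmap. The part_t type and check_part()
function are hypothetical names for this illustration only, not SLURM APIs.

	#include <stdio.h>

	/* Illustrative stand-in for a slurmctld partition record:
	 * a name plus the per-node processor counts of its members. */
	typedef struct {
		const char *name;
		int  node_cnt;
		const int *node_cpus;
	} part_t;

	/* Return 0 if every node in the partition has the same processor
	 * count, -1 otherwise; mirrors the walk in validate_node_proc_count(),
	 * where part_size == -1 means "no node examined yet". */
	static int check_part (const part_t *p)
	{
		int i, part_size = -1;

		for (i = 0; i < p->node_cnt; i++) {
			if (part_size == -1)
				part_size = p->node_cpus[i];
			else if (part_size != p->node_cpus[i]) {
				fprintf (stderr,
					"Partition %s has inconsistent processor count\n",
					p->name);
				return -1;
			}
		}
		return 0;
	}

	int main (void)
	{
		static const int ok_cpus[]  = { 4, 4, 4 };
		static const int bad_cpus[] = { 4, 4, 8 };
		part_t ok  = { "debug", 3, ok_cpus };
		part_t bad = { "batch", 3, bad_cpus };

		printf ("debug: %s\n", check_part (&ok)  == 0 ? "uniform" : "mixed");
		printf ("batch: %s\n", check_part (&bad) == 0 ? "uniform" : "mixed");
		return 0;
	}

The patch applies this rule in two places with different severities: fatal()
when slurm.conf is read, so a misconfigured controller refuses to start, and
a non-fatal EINVAL when an individual node registers with a mismatched count
under FastSchedule=0, which simply leaves that node down.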