From c18e876d4bca870cdc101080a41a9cff86987f4c Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 19 Apr 2010 19:07:49 +0000 Subject: [PATCH] Fix some problems with save/restore of node state (features and gres) --- src/common/node_conf.c | 8 ++- src/slurmctld/node_mgr.c | 130 +++++++++++++++++++++++------------- src/slurmctld/read_config.c | 6 +- src/slurmctld/slurmctld.h | 11 +-- 4 files changed, 100 insertions(+), 55 deletions(-) diff --git a/src/common/node_conf.c b/src/common/node_conf.c index 6c970ece1e9..be681e122ca 100644 --- a/src/common/node_conf.c +++ b/src/common/node_conf.c @@ -214,9 +214,11 @@ static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr, node_rec->node_state = state_val; node_rec->last_response = (time_t) 0; node_rec->comm_name = xstrdup(address); - - node_rec->port = node_ptr->port; - node_rec->reason = xstrdup(node_ptr->reason); + node_rec->port = node_ptr->port; + node_rec->weight = node_ptr->weight; + node_rec->features = xstrdup(node_ptr->feature); + node_rec->gres = xstrdup(node_ptr->gres); + node_rec->reason = xstrdup(node_ptr->reason); } else { /* FIXME - maybe should be fatal? */ error("reconfiguration for node %s, ignoring!", alias); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index a083a26af1d..50a9427069a 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -205,7 +205,8 @@ _dump_node_state (struct node_record *dump_node_ptr, Buf buffer) { packstr (dump_node_ptr->name, buffer); packstr (dump_node_ptr->reason, buffer); - packstr (dump_node_ptr->config_ptr->feature, buffer); + packstr (dump_node_ptr->features, buffer); + packstr (dump_node_ptr->gres, buffer); pack16 (dump_node_ptr->node_state, buffer); pack16 (dump_node_ptr->cpus, buffer); pack16 (dump_node_ptr->sockets, buffer); @@ -251,14 +252,15 @@ static int _open_node_state_file(char **state_file) * load_all_node_state - Load the node state from file, recover on slurmctld * restart. Execute this after loading the configuration file data. * Data goes into common storage. - * IN state_only - if true, overwrite only node state, features and reason + * IN state_only - if true, overwrite only node state and reason * Use this to overwrite the "UNKNOWN state typically used in slurm.conf * RET 0 or error code * NOTE: READ lock_slurmctld config before entry */ extern int load_all_node_state ( bool state_only ) { - char *node_name, *reason = NULL, *data = NULL, *state_file, *features; + char *node_name, *reason = NULL, *data = NULL, *state_file; + char *features, *gres; int data_allocated, data_read = 0, error_code = 0, node_cnt = 0; uint16_t node_state; uint16_t cpus = 1, sockets = 1, cores = 1, threads = 1; @@ -338,6 +340,7 @@ extern int load_all_node_state ( bool state_only ) safe_unpackstr_xmalloc (&node_name, &name_len, buffer); safe_unpackstr_xmalloc (&reason, &name_len, buffer); safe_unpackstr_xmalloc (&features, &name_len, buffer); + safe_unpackstr_xmalloc (&gres, &name_len, buffer); safe_unpack16 (&node_state, buffer); safe_unpack16 (&cpus, buffer); safe_unpack16 (&sockets, buffer); @@ -388,6 +391,7 @@ extern int load_all_node_state ( bool state_only ) error ("Node %s has vanished from configuration", node_name); xfree(features); + xfree(gres); xfree(reason); } else if (state_only) { uint16_t orig_flags; @@ -429,8 +433,8 @@ extern int load_all_node_state ( bool state_only ) node_ptr->reason_uid = reason_uid; } else xfree(reason); - xfree(node_ptr->features); - node_ptr->features = features; + xfree(features); + xfree(gres); } else { node_cnt++; if ((!power_save_mode) && @@ -450,6 +454,8 @@ extern int load_all_node_state ( bool state_only ) node_ptr->reason_uid = reason_uid; xfree(node_ptr->features); node_ptr->features = features; + xfree(node_ptr->gres); + node_ptr->gres = gres; node_ptr->part_cnt = 0; xfree(node_ptr->part_pptr); node_ptr->cpus = cpus; @@ -463,10 +469,11 @@ extern int load_all_node_state ( bool state_only ) } xfree (node_name); - if(node_ptr) + if (node_ptr) { select_g_update_node_state( (node_ptr - node_record_table_ptr), node_ptr->node_state); + } } fini: info("Recovered state of %d nodes", node_cnt); @@ -643,8 +650,8 @@ static void _pack_node (struct node_record *dump_node_ptr, Buf buffer, buffer, protocol_version); packstr(dump_node_ptr->arch, buffer); - packstr(dump_node_ptr->config_ptr->feature, buffer); - packstr(dump_node_ptr->config_ptr->gres, buffer); + packstr(dump_node_ptr->features, buffer); + packstr(dump_node_ptr->gres, buffer); packstr(dump_node_ptr->os, buffer); packstr(dump_node_ptr->reason, buffer); } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { @@ -673,7 +680,7 @@ static void _pack_node (struct node_record *dump_node_ptr, Buf buffer, buffer, protocol_version); packstr(dump_node_ptr->arch, buffer); - packstr(dump_node_ptr->config_ptr->feature, buffer); + packstr(dump_node_ptr->features, buffer); packstr(dump_node_ptr->os, buffer); packstr(dump_node_ptr->reason, buffer); } else { @@ -758,6 +765,21 @@ int update_node ( update_node_msg_t * update_node_msg ) break; } + if (update_node_msg->features) { + xfree(node_ptr->features); + if (update_node_msg->features[0]) + node_ptr->features = xstrdup(update_node_msg-> + features); + /* _update_node_features() logs and udates config */ + } + + if (update_node_msg->gres) { + xfree(node_ptr->gres); + if (update_node_msg->gres[0]) + node_ptr->gres = xstrdup(update_node_msg->gres); + /* _update_node_gres() logs and udates config */ + } + if ((update_node_msg -> reason) && (update_node_msg -> reason[0])) { xfree(node_ptr->reason); @@ -948,55 +970,71 @@ int update_node ( update_node_msg_t * update_node_msg ) return error_code; } +/* variation of strcmp that accepts NULL pointers */ +static int _strcmp(char *str1, char *str2) +{ + if (!str1 && !str2) + return 0; + if (str1 && !str2) + return 1; + if (!str1 && str2) + return -1; + return strcmp(str1, str2); +} + /* - * restore_node_features - Restore node Features and Weight based upon state - * previously in memory (preserves interactive updates) + * restore_node_features - Make node and config (from slurm.conf) fields + * consistent for Features, Gres and Weight + * IN recover - + * 0, 1 - use data from config record, built using slurm.conf + * 2 = use data from node record, built from saved state */ -extern void restore_node_features(void) +extern void restore_node_features(int recover) { - int i, j; - char *node_list; - struct node_record *node_ptr1, *node_ptr2; - hostlist_t hl; + int i; + struct node_record *node_ptr; - for (i=0, node_ptr1=node_record_table_ptr; i<node_record_count; - i++, node_ptr1++) { + for (i=0, node_ptr=node_record_table_ptr; i<node_record_count; + i++, node_ptr++) { - if (node_ptr1->weight != node_ptr1->config_ptr->weight) { + if (node_ptr->weight != node_ptr->config_ptr->weight) { error("Node %s Weight(%u) differ from slurm.conf", - node_ptr1->name, node_ptr1->weight); - _update_node_weight(node_ptr1->name, - node_ptr1->weight); + node_ptr->name, node_ptr->weight); + if (recover == 2) { + _update_node_weight(node_ptr->name, + node_ptr->weight); + } else { + node_ptr->weight = node_ptr->config_ptr-> + weight; + } } - if (!node_ptr1->config_ptr->feature && !node_ptr1->features) - continue; /* No feature to preserve */ - if (node_ptr1->config_ptr->feature && node_ptr1->features && - !strcmp(node_ptr1->config_ptr->feature, - node_ptr1->features)) { - continue; /* Identical feature value */ + if (_strcmp(node_ptr->config_ptr->feature, node_ptr->features)){ + error("Node %s Features(%s) differ from slurm.conf", + node_ptr->name, node_ptr->features); + if (recover == 2) { + _update_node_features(node_ptr->name, + node_ptr->features); + } else { + xfree(node_ptr->features); + node_ptr->features = xstrdup(node_ptr-> + config_ptr-> + feature); + } } - hl = hostlist_create(node_ptr1->name); - for (j=(i+1), node_ptr2=(node_ptr1+1); j<node_record_count; - j++, node_ptr2++) { - if ((!node_ptr1->features && !node_ptr2->features) || - ( node_ptr1->features && node_ptr2->features && - !strcmp(node_ptr1->features, - node_ptr2->features))) { - /* Reset this job's feature at same time */ - hostlist_push(hl, node_ptr2->name); + if (_strcmp(node_ptr->config_ptr->gres, node_ptr->gres)) { + error("Node %s Gres(%s) differ from slurm.conf", + node_ptr->name, node_ptr->gres); + if (recover == 2) { + _update_node_gres(node_ptr->name, + node_ptr->gres); + } else { + xfree(node_ptr->gres); + node_ptr->gres = xstrdup(node_ptr->config_ptr-> + gres); } } - - node_list = xmalloc(2048); - hostlist_ranged_string(hl, 2048, node_list); - error("Node %s Features(%s) differ from slurm.conf", - node_list, node_ptr1->features); - _update_node_features(node_list, node_ptr1->features); - xfree(node_ptr1->features); - xfree(node_list); - hostlist_destroy(hl); } } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 49bbd44480f..20fa0bc1d0a 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -760,13 +760,12 @@ int read_slurm_conf(int recover, bool reconfig) if (reconfig) { load_all_resv_state(0); - if (recover == 2) - restore_node_features(); } else { load_all_resv_state(recover); if (recover >= 1) (void) trigger_state_restore(); } + restore_node_features(recover); /* sort config_list by weight for scheduling */ list_sort(config_list, &list_compare_config); @@ -886,6 +885,9 @@ static int _restore_node_state(int recover, xfree(node_ptr->features); node_ptr->features = old_node_ptr->features; old_node_ptr->features = NULL; + xfree(node_ptr->gres); + node_ptr->gres = old_node_ptr->gres; + old_node_ptr->gres = NULL; } if (old_node_ptr->arch) { xfree(node_ptr->arch); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index f650653326e..f1f352aa01b 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1115,7 +1115,7 @@ extern int load_all_job_state ( void ); * load_all_node_state - Load the node state from file, recover on slurmctld * restart. Execute this after loading the configuration file data. * Data goes into common storage. - * IN state_only - if true over-write only node state, features and reason + * IN state_only - if true over-write only node state, features, gres and reason * RET 0 or error code */ extern int load_all_node_state ( bool state_only ); @@ -1351,10 +1351,13 @@ extern void reset_job_bitmaps (void); extern void reset_job_priority(void); /* - * restore_node_features - Restore node features based upon state - * saved (preserves interactive updates) + * restore_node_features - Make node and config (from slurm.conf) fields + * consistent for Features, Gres and Weight + * IN recover - + * 0, 1 - use data from config record, built using slurm.conf + * 2 = use data from node record, built from saved state */ -extern void restore_node_features(void); +extern void restore_node_features(int recover); /* Update time stamps for job step resume */ extern void resume_job_step(struct job_record *job_ptr); -- GitLab