From da113d70e4d7dbff21aa8023740a1aaa5cc3c423 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 14 Jul 2010 23:29:06 +0000 Subject: [PATCH] general improvement in existing gres logic --- src/common/gres.c | 202 ++++++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 113 deletions(-) diff --git a/src/common/gres.c b/src/common/gres.c index c147605b45c..350eb5992b6 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -125,12 +125,32 @@ static uint32_t _get_gres_cnt(char *orig_config, char *gres_name, char *gres_name_colon, int gres_name_colon_len); static char * _get_gres_conf(void); static uint32_t _get_tot_gres_cnt(uint32_t plugin_id, uint32_t *set_cnt); +static void _gres_job_list_delete(void *list_element); +static void _job_state_delete(void *gres_data); +static void * _job_state_dup(void *gres_data); +static int _job_state_pack(void *gres_data, Buf buffer); +static int _job_state_unpack(void **gres_data, Buf buffer, + char *gres_name); +static int _job_state_validate(char *config, void **gres_data, + slurm_gres_context_t *gres_name); static int _load_gres_plugin(char *plugin_name, slurm_gres_context_t *plugin_context); static int _log_gres_slurmd_conf(void *x, void *arg); static int _node_config_init(char *node_name, char *orig_config, slurm_gres_context_t *context_ptr, gres_state_t *gres_ptr); +static int _node_reconfig(char *node_name, char *orig_config, + char **new_config, gres_state_t *gres_ptr, + uint16_t fast_schedule, + slurm_gres_context_t *context_ptr); +static void _node_state_dealloc(void *gres_data); +static void * _node_state_dup(void *gres_data); +static void _node_state_log(void *gres_data, char *node_name, + char *gres_name); +static void _node_state_pack(void *gres_data, Buf buffer); +static int _node_state_realloc(void *job_gres_data, int node_offset, + void *node_gres_data, char *gres_name); +static int _node_state_unpack(void **gres_data, Buf buffer); static int _parse_gres_config(void **dest, slurm_parser_enum_t type, const char *key, const char *value, const char *line, char **leftover); @@ -965,7 +985,10 @@ extern int _node_config_validate(char *node_name, char *orig_config, if (updated_config == false) return SLURM_SUCCESS; - if (set_cnt != gres_data->topo_cnt) { + if ((set_cnt != gres_data->topo_cnt) || 1) { + /* Rebuild GRES information when the node registers. + * Do we want to do this for every node registration + * since it is fairly high overhead? */ for (i=0; i<gres_data->topo_cnt; i++) { FREE_NULL_BITMAP(gres_data->cpus_bitmap[i]); FREE_NULL_BITMAP(gres_data->gres_block_bitmap[i]); @@ -1005,6 +1028,7 @@ extern int _node_config_validate(char *node_name, char *orig_config, } list_iterator_destroy(iter); } + if ((orig_config == NULL) || (orig_config[0] == '\0')) gres_data->gres_cnt_config = 0; else if (gres_data->gres_cnt_config == NO_VAL) { @@ -1025,7 +1049,7 @@ extern int _node_config_validate(char *node_name, char *orig_config, if (gres_data->gres_bit_alloc == NULL) { gres_data->gres_bit_alloc = bit_alloc(gres_data->gres_cnt_avail); - } else if (gres_data->gres_cnt_avail > + } else if (gres_data->gres_cnt_avail != bit_size(gres_data->gres_bit_alloc)) { gres_data->gres_bit_alloc = bit_realloc(gres_data->gres_bit_alloc, @@ -1110,92 +1134,53 @@ extern int gres_plugin_node_config_validate(char *node_name, } static int _node_reconfig(char *node_name, char *orig_config, char **new_config, - void **gres_data, uint16_t fast_schedule, - char *gres_name) + gres_state_t *gres_ptr, uint16_t fast_schedule, + slurm_gres_context_t *context_ptr) { - char name_colon[128]; - int rc = SLURM_SUCCESS, name_colon_len; - gres_node_state_t *gres_ptr; - char *node_gres_config = NULL, *tok = NULL, *last = NULL; - int32_t gres_config_cnt = 0; + int rc = SLURM_SUCCESS; + gres_node_state_t *gres_data; - xassert(gres_data); - if (*gres_data == NULL) - *gres_data = _build_gres_node_state(); - gres_ptr = (gres_node_state_t *) *gres_data; + xassert(gres_ptr); + if (gres_ptr->gres_data == NULL) + gres_ptr->gres_data = _build_gres_node_state(); + gres_data = gres_ptr->gres_data; - name_colon_len = snprintf(name_colon, sizeof(name_colon), "%s:", - gres_name); - if (orig_config) { - node_gres_config = xstrdup(orig_config); - tok = strtok_r(node_gres_config, ",", &last); - } - while (tok) { - if (!strcmp(tok, gres_name)) { - gres_config_cnt = 1; - break; - } - if (!strncmp(tok, name_colon, name_colon_len)) { - gres_config_cnt = strtol(tok+name_colon_len, &last, 10); - if (last[0] == '\0') - ; - else if ((last[0] == 'k') || (last[0] == 'K')) - gres_config_cnt *= 1024; - break; - } - tok = strtok_r(NULL, ",", &last); - } - gres_ptr->gres_cnt_config = gres_config_cnt; - xfree(node_gres_config); + gres_data->gres_cnt_config = _get_gres_cnt(orig_config, + context_ptr->ops.gres_name, + context_ptr->gres_name_colon, + context_ptr-> + gres_name_colon_len); + if ((gres_data->gres_cnt_config == 0) || (fast_schedule > 0)) + gres_data->gres_cnt_avail = gres_data->gres_cnt_config; + else if (gres_data->gres_cnt_found != NO_VAL) + gres_data->gres_cnt_avail = gres_data->gres_cnt_found; + else if (gres_data->gres_cnt_avail == NO_VAL) + gres_data->gres_cnt_avail = 0; - if ((gres_ptr->gres_cnt_config == 0) || (fast_schedule > 0)) - gres_ptr->gres_cnt_avail = gres_ptr->gres_cnt_config; - else if (gres_ptr->gres_cnt_found != NO_VAL) - gres_ptr->gres_cnt_avail = gres_ptr->gres_cnt_found; - else if (gres_ptr->gres_cnt_avail == NO_VAL) - gres_ptr->gres_cnt_avail = 0; - - if (gres_ptr->gres_bit_alloc == NULL) { - gres_ptr->gres_bit_alloc = bit_alloc(gres_ptr->gres_cnt_avail); - } else if (gres_ptr->gres_cnt_avail > - bit_size(gres_ptr->gres_bit_alloc)) { - gres_ptr->gres_bit_alloc = bit_realloc(gres_ptr->gres_bit_alloc, - gres_ptr->gres_cnt_avail); - } - if (gres_ptr->gres_bit_alloc == NULL) + if (gres_data->gres_bit_alloc == NULL) { + gres_data->gres_bit_alloc = bit_alloc(gres_data->gres_cnt_avail); + } else if (gres_data->gres_cnt_avail != + bit_size(gres_data->gres_bit_alloc)) { + gres_data->gres_bit_alloc = bit_realloc(gres_data->gres_bit_alloc, + gres_data->gres_cnt_avail); + } + if (gres_data->gres_bit_alloc == NULL) fatal("bit_alloc: malloc failure"); if ((fast_schedule < 2) && - (gres_ptr->gres_cnt_found != NO_VAL) && - (gres_ptr->gres_cnt_found < gres_ptr->gres_cnt_config)) { + (gres_data->gres_cnt_found != NO_VAL) && + (gres_data->gres_cnt_found < gres_data->gres_cnt_config)) { /* Do not set node DOWN, but give the node * a chance to register with more resources */ - gres_ptr->gres_cnt_found = NO_VAL; + gres_data->gres_cnt_found = NO_VAL; } else if ((fast_schedule == 0) && - (gres_ptr->gres_cnt_found != NO_VAL) && - (gres_ptr->gres_cnt_found > gres_ptr->gres_cnt_config)) { - /* need to rebuild new_config */ - char *new_configured_res = NULL; - if (*new_config) - node_gres_config = xstrdup(*new_config); - else - node_gres_config = xstrdup(orig_config); - tok = strtok_r(node_gres_config, ",", &last); - while (tok) { - if (new_configured_res) - xstrcat(new_configured_res, ","); - if (strcmp(tok, gres_name) && - strncmp(tok, name_colon, name_colon_len)) { - xstrcat(new_configured_res, tok); - } else { - xstrfmtcat(new_configured_res, "%s:%u", - name_colon, gres_ptr->gres_cnt_found); - } - tok = strtok_r(NULL, ",", &last); - } - xfree(node_gres_config); - xfree(*new_config); - *new_config = new_configured_res; + (gres_data->gres_cnt_found != NO_VAL) && + (gres_data->gres_cnt_found > gres_data->gres_cnt_config)) { + _set_gres_cnt(orig_config, new_config, + gres_data->gres_cnt_found, + context_ptr->ops.gres_name, + context_ptr->gres_name_colon, + context_ptr->gres_name_colon_len); } return rc; @@ -1241,23 +1226,20 @@ extern int gres_plugin_node_reconfig(char *node_name, continue; rc = _node_reconfig(node_name, orig_config, new_config, - &gres_ptr->gres_data, fast_schedule, - gres_context[i].ops.gres_name); + gres_ptr, fast_schedule, &gres_context[i]); } slurm_mutex_unlock(&gres_context_lock); return rc; } -static int _node_state_pack(void *gres_data, Buf buffer) +static void _node_state_pack(void *gres_data, Buf buffer) { gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data; pack32(gres_ptr->gres_cnt_avail, buffer); pack32(gres_ptr->gres_cnt_alloc, buffer); pack_bit_str(gres_ptr->gres_bit_alloc, buffer); - - return SLURM_SUCCESS; } static int _node_state_unpack(void **gres_data, Buf buffer) @@ -1307,7 +1289,7 @@ unpack_error: extern int gres_plugin_node_state_pack(List gres_list, Buf buffer, char *node_name) { - int i, rc = SLURM_SUCCESS, rc2; + int i, rc = SLURM_SUCCESS; uint32_t top_offset, gres_size = 0; uint32_t header_offset, size_offset, data_offset, tail_offset; uint32_t magic = GRES_MAGIC; @@ -1341,12 +1323,7 @@ extern int gres_plugin_node_state_pack(List gres_list, Buf buffer, size_offset = get_buf_offset(buffer); pack32(gres_size, buffer); /* placeholder */ data_offset = get_buf_offset(buffer); - rc2 = _node_state_pack(gres_ptr->gres_data, buffer); - if (rc2 != SLURM_SUCCESS) { - rc = rc2; - set_buf_offset(buffer, header_offset); - break; - } + _node_state_pack(gres_ptr->gres_data, buffer); tail_offset = get_buf_offset(buffer); set_buf_offset(buffer, size_offset); gres_size = tail_offset - data_offset; @@ -1474,6 +1451,7 @@ unpack_error: static void *_node_state_dup(void *gres_data) { + int i; gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data; gres_node_state_t *new_gres; @@ -1487,6 +1465,17 @@ static void *_node_state_dup(void *gres_data) new_gres->gres_cnt_alloc = gres_ptr->gres_cnt_alloc; new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc); + new_gres->topo_cnt = gres_ptr->topo_cnt; + new_gres->cpus_bitmap = xmalloc(gres_ptr->topo_cnt * + sizeof(bitstr_t *)); + new_gres->gres_block_bitmap = xmalloc(gres_ptr->topo_cnt * + sizeof(bitstr_t *)); + for (i=0; i<gres_ptr->topo_cnt; i++) { + new_gres->cpus_bitmap[i] = bit_copy(gres_ptr->cpus_bitmap[i]); + new_gres->gres_block_bitmap[i] = bit_copy(gres_ptr-> + gres_block_bitmap[i]); + } + return new_gres; } @@ -1560,7 +1549,6 @@ static void _node_state_dealloc(void *gres_data) */ extern void gres_plugin_node_state_dealloc(List gres_list) { - int i; ListIterator gres_iter; gres_state_t *gres_ptr; @@ -1572,13 +1560,7 @@ extern void gres_plugin_node_state_dealloc(List gres_list) slurm_mutex_lock(&gres_context_lock); gres_iter = list_iterator_create(gres_list); while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { - for (i=0; i<gres_context_cnt; i++) { - if (gres_ptr->plugin_id != - *(gres_context[i].ops.plugin_id)) - continue; - _node_state_dealloc(gres_ptr->gres_data); - break; - } + _node_state_dealloc(gres_ptr->gres_data); } list_iterator_destroy(gres_iter); slurm_mutex_unlock(&gres_context_lock); @@ -1787,7 +1769,6 @@ static void _job_state_delete(void *gres_data) static void _gres_job_list_delete(void *list_element) { - int i; gres_state_t *gres_ptr; if (gres_plugin_init() != SLURM_SUCCESS) @@ -1795,31 +1776,26 @@ static void _gres_job_list_delete(void *list_element) gres_ptr = (gres_state_t *) list_element; slurm_mutex_lock(&gres_context_lock); - for (i=0; i<gres_context_cnt; i++) { - if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) - continue; - _job_state_delete(gres_ptr->gres_data); - xfree(gres_ptr); - break; - } + _job_state_delete(gres_ptr->gres_data); + xfree(gres_ptr); slurm_mutex_unlock(&gres_context_lock); } -static int _job_state_validate(char *config, void **gres_data, char *gres_name) +static int _job_state_validate(char *config, void **gres_data, + slurm_gres_context_t *context_ptr) { char *last = NULL; - char name_colon[128]; - int name_colon_len; + char *gres_name = context_ptr->ops.gres_name; + char *name_colon = context_ptr->gres_name_colon; + int name_colon_len = context_ptr->gres_name_colon_len; gres_job_state_t *gres_ptr; uint32_t cnt; uint8_t mult = 0; - name_colon_len = snprintf(name_colon, sizeof(name_colon), "%s:", - gres_name); if (!strcmp(config, gres_name)) { cnt = 1; } else if (!strncmp(config, name_colon, name_colon_len)) { - cnt = strtol(config+name_colon_len, &last, 10); + cnt = strtol(config + name_colon_len, &last, 10); if (last[0] == '\0') ; else if ((last[0] == 'k') || (last[0] == 'K')) @@ -1874,7 +1850,7 @@ extern int gres_plugin_job_state_validate(char *req_config, List *gres_list) rc2 = SLURM_ERROR; for (i=0; i<gres_context_cnt; i++) { rc2 = _job_state_validate(tok, &gres_data, - gres_context[i].ops.gres_name); + &gres_context[i]); if (rc2 != SLURM_SUCCESS) continue; gres_ptr = xmalloc(sizeof(gres_state_t)); -- GitLab