From a6fad4035879d90ab7746035e257917e6cef3aae Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 30 Jul 2010 19:16:27 +0000 Subject: [PATCH] improve logic for node's gres state save/restore/deallocate. let job/step specify gres requirement with suffix of M (mega) --- src/common/gres.c | 200 ++++++++++++++++++---------------------------- 1 file changed, 79 insertions(+), 121 deletions(-) diff --git a/src/common/gres.c b/src/common/gres.c index 04a756cf5e7..70da277e31a 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -54,14 +54,17 @@ # include <stdint.h> # endif # endif /* HAVE_INTTYPES_H */ +# ifdef HAVE_LIMITS_H +# include <limits.h> +# endif #else /* ! HAVE_CONFIG_H */ +# include <limits.h> # include <sys/types.h> # include <stdint.h> # include <stdlib.h> # include <string.h> #endif /* HAVE_CONFIG_H */ -#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <slurm/slurm.h> @@ -151,15 +154,12 @@ static int _node_reconfig(char *node_name, char *orig_config, char **new_config, gres_state_t *gres_ptr, uint16_t fast_schedule, slurm_gres_context_t *context_ptr); -static void _node_state_dealloc(void *gres_data); +static void _node_state_dealloc(gres_state_t *gres_ptr); static void * _node_state_dup(void *gres_data); static void _node_state_log(void *gres_data, char *node_name, char *gres_name); -static void _node_state_pack(void *gres_data, Buf buffer); static int _node_state_realloc(void *job_gres_data, int node_offset, void *node_gres_data, char *gres_name); -static int _node_state_unpack(void **gres_data, Buf buffer, - bool has_file); static int _parse_gres_config(void **dest, slurm_parser_enum_t type, const char *key, const char *value, const char *line, char **leftover); @@ -633,8 +633,9 @@ static int _parse_gres_config(void **dest, slurm_parser_enum_t type, fatal("bit_alloc: malloc failure"); i = bit_unfmt(cpu_bitmap, p->cpus); if (i != 0) { - fatal("Invalid gres data for %s, CPUs=%s", - p->name, p->cpus); + fatal("Invalid gres data for %s, CPUs=%s (only %u CPUs" + " are available)", + p->name, p->cpus, gres_cpu_cnt); } FREE_NULL_BITMAP(cpu_bitmap); } @@ -1486,55 +1487,6 @@ extern int gres_plugin_node_reconfig(char *node_name, return rc; } -static void _node_state_pack(void *gres_data, Buf buffer) -{ - gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data; - - pack32(gres_ptr->gres_cnt_avail, buffer); - pack32(gres_ptr->gres_cnt_alloc, buffer); - pack_bit_str(gres_ptr->gres_bit_alloc, buffer); -} - -static int _node_state_unpack(void **gres_data, Buf buffer, bool has_file) -{ - gres_node_state_t *gres_ptr; - - gres_ptr = xmalloc(sizeof(gres_node_state_t)); - - gres_ptr->gres_cnt_found = NO_VAL; - if (buffer) { - safe_unpack32(&gres_ptr->gres_cnt_avail, buffer); - safe_unpack32(&gres_ptr->gres_cnt_alloc, buffer); - unpack_bit_str(&gres_ptr->gres_bit_alloc, buffer); - if (!has_file) - FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc); - if ((gres_ptr->gres_bit_alloc != NULL) && - (gres_ptr->gres_cnt_avail != - bit_size(gres_ptr->gres_bit_alloc))) { - gres_ptr->gres_bit_alloc = - bit_realloc(gres_ptr->gres_bit_alloc, - gres_ptr->gres_cnt_avail); - if (gres_ptr->gres_bit_alloc == NULL) - goto unpack_error; - } - if ((gres_ptr->gres_bit_alloc != NULL) && - (gres_ptr->gres_cnt_alloc != - bit_set_count(gres_ptr->gres_bit_alloc))) { - error("gres _node_state_unpack bit count inconsistent"); - goto unpack_error; - } - } - - *gres_data = gres_ptr; - return SLURM_SUCCESS; - -unpack_error: - FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc); - xfree(gres_ptr); - *gres_data = NULL; - return SLURM_ERROR; -} - /* * Pack a node's current gres status, called from slurmctld for save/restore * IN gres_list - generated by gres_plugin_node_config_validate() @@ -1544,13 +1496,14 @@ unpack_error: extern int gres_plugin_node_state_pack(List gres_list, Buf buffer, char *node_name) { - int i, rc = SLURM_SUCCESS; - uint32_t top_offset, gres_size = 0; - uint32_t header_offset, size_offset, data_offset, tail_offset; + int rc = SLURM_SUCCESS; + uint32_t top_offset, tail_offset; uint32_t magic = GRES_MAGIC; uint16_t rec_cnt = 0; + uint8_t has_bitmap; ListIterator gres_iter; gres_state_t *gres_ptr; + gres_node_state_t *gres_node_ptr; if (gres_list == NULL) { pack16(rec_cnt, buffer); @@ -1568,29 +1521,19 @@ extern int gres_plugin_node_state_pack(List gres_list, Buf buffer, slurm_mutex_lock(&gres_context_lock); gres_iter = list_iterator_create(gres_list); while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { - for (i=0; i<gres_context_cnt; i++) { - if (gres_ptr->plugin_id != gres_context[i].plugin_id) - continue; - header_offset = get_buf_offset(buffer); - pack32(magic, buffer); - pack32(gres_ptr->plugin_id, buffer); - size_offset = get_buf_offset(buffer); - pack32(gres_size, buffer); /* placeholder */ - data_offset = get_buf_offset(buffer); - _node_state_pack(gres_ptr->gres_data, buffer); - tail_offset = get_buf_offset(buffer); - set_buf_offset(buffer, size_offset); - gres_size = tail_offset - data_offset; - pack32(gres_size, buffer); - set_buf_offset(buffer, tail_offset); - rec_cnt++; - break; - } - if (i >= gres_context_cnt) { - error("Could not find plugin id %u to pack record for " - "node %s", - gres_ptr->plugin_id, node_name); - } + gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data; + pack32(magic, buffer); + pack32(gres_ptr->plugin_id, buffer); + pack32(gres_node_ptr->gres_cnt_avail, buffer); + /* Just note if gres_bit_alloc exists. + * Rebuild it based upon the state of recovered jobs */ + if (gres_node_ptr->gres_bit_alloc) + has_bitmap = 1; + else + has_bitmap = 0; + pack8(has_bitmap, buffer); + rec_cnt++; + break; } list_iterator_destroy(gres_iter); slurm_mutex_unlock(&gres_context_lock); @@ -1612,11 +1555,12 @@ extern int gres_plugin_node_state_pack(List gres_list, Buf buffer, extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer, char *node_name) { - int i, rc, rc2; - uint32_t gres_size, magic, tail_offset, plugin_id; + int i, rc; + uint32_t gres_cnt_avail, magic, plugin_id; uint16_t rec_cnt; + uint8_t has_bitmap; gres_state_t *gres_ptr; - void *gres_data; + gres_node_state_t *gres_node_ptr; safe_unpack16(&rec_cnt, buffer); if (rec_cnt == 0) @@ -1642,7 +1586,8 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer, if (magic != GRES_MAGIC) goto unpack_error; safe_unpack32(&plugin_id, buffer); - safe_unpack32(&gres_size, buffer); + safe_unpack32(&gres_cnt_avail, buffer); + safe_unpack8(&has_bitmap, buffer); for (i=0; i<gres_context_cnt; i++) { if (gres_context[i].plugin_id == plugin_id) break; @@ -1653,25 +1598,21 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer, plugin_id, node_name); /* A likely sign that GresPlugins has changed. * Not a fatal error, skip over the data. */ - tail_offset = get_buf_offset(buffer); - tail_offset += gres_size; - set_buf_offset(buffer, tail_offset); continue; } gres_context[i].unpacked_info = true; - rc2 = _node_state_unpack(&gres_data, buffer, - gres_context[i].has_file); - if (rc2 != SLURM_SUCCESS) { - error("gres_plugin_node_state_unpack: error unpacking " - "data of type %s from node %s", - gres_context[i].gres_name, node_name); - rc = rc2; - } else { - gres_ptr = xmalloc(sizeof(gres_state_t)); - gres_ptr->plugin_id = gres_context[i].plugin_id; - gres_ptr->gres_data = gres_data; - list_append(*gres_list, gres_ptr); + gres_node_ptr = _build_gres_node_state(); + gres_node_ptr->gres_cnt_avail = gres_cnt_avail; + if (has_bitmap) { + gres_node_ptr->gres_bit_alloc = + bit_alloc(gres_cnt_avail); + if (gres_node_ptr->gres_bit_alloc == NULL) + fatal("bit_alloc: malloc failure"); } + gres_ptr = xmalloc(sizeof(gres_state_t)); + gres_ptr->plugin_id = gres_context[i].plugin_id; + gres_ptr->gres_data = gres_node_ptr; + list_append(*gres_list, gres_ptr); } fini: /* Insure that every gres plugin is called for unpack, even if no data @@ -1681,18 +1622,12 @@ fini: /* Insure that every gres plugin is called for unpack, even if no data if (gres_context[i].unpacked_info) continue; error("gres_plugin_node_state_unpack: no info packed for %s " - "by node %s", - gres_context[i].gres_type, node_name); - rc2 = _node_state_unpack(&gres_data, NULL, - gres_context[i].has_file); - if (rc2 != SLURM_SUCCESS) { - rc = rc2; - } else { - gres_ptr = xmalloc(sizeof(gres_state_t)); - gres_ptr->plugin_id = gres_context[i].plugin_id; - gres_ptr->gres_data = gres_data; - list_append(*gres_list, gres_ptr); - } + "by node %s", gres_context[i].gres_type, node_name); + gres_node_ptr = _build_gres_node_state(); + gres_ptr = xmalloc(sizeof(gres_state_t)); + gres_ptr->plugin_id = gres_context[i].plugin_id; + gres_ptr->gres_data = gres_node_ptr; + list_append(*gres_list, gres_ptr); } slurm_mutex_unlock(&gres_context_lock); @@ -1721,6 +1656,8 @@ static void *_node_state_dup(void *gres_data) new_gres->gres_cnt_alloc = gres_ptr->gres_cnt_alloc; if (gres_ptr->gres_bit_alloc) new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc); + if (gres_ptr->topo_cnt == 0) + return new_gres; new_gres->topo_cnt = gres_ptr->topo_cnt; new_gres->topo_cpus_bitmap = xmalloc(gres_ptr->topo_cnt * @@ -1736,12 +1673,14 @@ static void *_node_state_dup(void *gres_data) bit_copy(gres_ptr->topo_cpus_bitmap[i]); new_gres->topo_gres_bitmap[i] = bit_copy(gres_ptr->topo_gres_bitmap[i]); + if ((new_gres->topo_cpus_bitmap[i] == NULL) || + (new_gres->topo_gres_bitmap[i] == NULL)) + fatal("bit_copy: malloc failure"); new_gres->topo_gres_cnt_alloc[i] = gres_ptr->topo_gres_cnt_alloc[i]; new_gres->topo_gres_cnt_avail[i] = gres_ptr->topo_gres_cnt_avail[i]; } - return new_gres; } @@ -1794,15 +1733,32 @@ extern List gres_plugin_node_state_dup(List gres_list) return new_list; } -static void _node_state_dealloc(void *gres_data) +static void _node_state_dealloc(gres_state_t *gres_ptr) { - gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data; + int i; + gres_node_state_t *gres_node_ptr; + char *gres_name = NULL; - gres_ptr->gres_cnt_alloc = 0; - if (gres_ptr->gres_bit_alloc) { - int i = bit_size(gres_ptr->gres_bit_alloc) - 1; + gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data; + gres_node_ptr->gres_cnt_alloc = 0; + if (gres_node_ptr->gres_bit_alloc) { + int i = bit_size(gres_node_ptr->gres_bit_alloc) - 1; if (i > 0) - bit_nclear(gres_ptr->gres_bit_alloc, 0, i); + bit_nclear(gres_node_ptr->gres_bit_alloc, 0, i); + } + if (gres_node_ptr->topo_cnt && !gres_node_ptr->topo_gres_cnt_alloc) { + for (i=0; i<gres_context_cnt; i++) { + if (gres_ptr->plugin_id == gres_context[i].plugin_id) { + gres_name = gres_context[i].gres_name; + break; + } + } + error("gres_plugin_node_state_dealloc: gres/%s topo_cnt!=0 " + "and topo_gres_cnt_alloc is NULL", gres_name); + } else { + for (i=0; i<gres_node_ptr->topo_cnt; i++) { + gres_node_ptr->topo_gres_cnt_alloc[i] = 0; + } } } @@ -1825,7 +1781,7 @@ extern void gres_plugin_node_state_dealloc(List gres_list) slurm_mutex_lock(&gres_context_lock); gres_iter = list_iterator_create(gres_list); while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { - _node_state_dealloc(gres_ptr->gres_data); + _node_state_dealloc(gres_ptr); } list_iterator_destroy(gres_iter); slurm_mutex_unlock(&gres_context_lock); @@ -2070,6 +2026,8 @@ static int _job_config_validate(char *config, uint32_t *gres_cnt, ; else if ((last_num[0] == 'k') || (last_num[0] == 'K')) cnt *= 1024; + else if ((last_num[0] == 'm') || (last_num[0] == 'M')) + cnt *= (1024 * 1024); else return SLURM_ERROR; if (cnt <= 0) -- GitLab