From c17e3093ee84ae76ab59b5aede890dc23b252de2 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 8 Jul 2010 00:04:31 +0000
Subject: [PATCH] Add logic to read and load state based upon gres.conf file
---
 src/common/Makefile.am | 1 +
 src/common/Makefile.in | 2 +-
 src/common/gres.c | 116 ++++++++++++++++++++++++++++++-
 src/common/gres.h | 7 ++
 src/plugins/gres/gpu/gres_gpu.c | 117 +++++++-------------------------
 src/plugins/gres/nic/gres_nic.c | 117 +++++++-------------------------
 6 files changed, 170 insertions(+), 190 deletions(-)

diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 8822210e4d3..6e4b8f28213 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -8,6 +8,7 @@
 # only be 1 address per symbol. If you link to the libcommon.la in
 # a plugin you will get 2 addresses for one symbol which could lead to problems.

+CPPFLAGS = -DGRES_CONFIG_FILE=\"$(sysconfdir)/gres.conf\"
 AUTOMAKE_OPTIONS = foreign

 if HAVE_UNSETENV
diff --git a/src/common/Makefile.in b/src/common/Makefile.in
index 6d9a79176a1..0e0bae8dc5c 100644
--- a/src/common/Makefile.in
+++ b/src/common/Makefile.in
@@ -218,7 +218,7 @@ CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
 CMD_LDFLAGS = @CMD_LDFLAGS@
 CPP = @CPP@
-CPPFLAGS = @CPPFLAGS@
+CPPFLAGS = -DGRES_CONFIG_FILE=\"$(sysconfdir)/gres.conf\"
 CXX = @CXX@
 CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
diff --git a/src/common/gres.c b/src/common/gres.c
index b26637efdcb..b0b96a25a95 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -62,12 +62,16 @@
 #endif /* HAVE_CONFIG_H */

 #include <stdio.h>
+#include <stdlib.h>
 #include <slurm/slurm.h>
 #include <slurm/slurm_errno.h>
+#include <sys/stat.h>

+#include "src/common/gres.h"
 #include "src/common/list.h"
 #include "src/common/macros.h"
 #include "src/common/pack.h"
+#include "src/common/parse_config.h"
 #include "src/common/plugin.h"
 #include "src/common/plugrack.h"
 #include "src/common/slurm_protocol_api.h"
@@ -82,7 +86,7 @@ typedef struct slurm_gres_ops {
 	int (*node_config_init)     ( char *node_name,
 	                              char *orig_config,
 	                              void **gres_data );
-	int (*node_config_load)     ( void );
+	int (*node_config_load)     ( List gres_conf_list );
 	int (*node_config_pack)     ( Buf buffer );
 	int (*node_config_unpack)   ( Buf buffer );
 	int (*node_config_validate) ( char *node_name,
@@ -486,21 +490,127 @@ extern int gres_plugin_reconfig(bool *did_change)
 	return rc;
 }

+static char *_get_gres_conf(void)
+{
+	char *val = getenv("SLURM_CONF");
+	char *rc = NULL;
+	int i;
+
+	if (!val)
+		return xstrdup(GRES_CONFIG_FILE);
+
+	/* Replace file name on end of path */
+	i = strlen(val) - strlen("slurm.conf") + strlen("gres.conf") + 1;
+	rc = xmalloc(i);
+	strcpy(rc, val);
+	val = strrchr(rc, (int)'/');
+	if (val)	/* absolute path */
+		val++;
+	else		/* not absolute path */
+		val = rc;
+	strcpy(val, "gres.conf");
+	return rc;
+}
+
+static void _destroy_gres_conf(void *x)
+{
+	gres_conf_t *p = (gres_conf_t *) x;
+
+	xassert(p);
+	xfree(p->cpus);
+	xfree(p->file);
+	xfree(p->name);
+	xfree(p);
+}
+
+static int _log_gres_conf(void *x, void *arg)
+{
+	gres_conf_t *p;
+
+	if (!gres_debug)
+		return 0;
+
+	p = (gres_conf_t *) x;
+	xassert(p);
+	info("Gres Name:%s File:%s CPUs:%s Count:%u",
+	     p->name, p->file, p->cpus, p->count);
+	return 0;
+}
+
+static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
+			      const char *key, const char *value,
+			      const char *line, char **leftover)
+{
+	static s_p_options_t _gres_options[] = {
+		{"Count", S_P_UINT16},	/* Number of Gres available */
+		{"CPUs", S_P_STRING},	/* CPUs to bind to Gres resource */
+		{"File", S_P_STRING},	/* Path to Gres device */
+		{NULL}
+	};
+
+	s_p_hashtbl_t *tbl;
+	gres_conf_t *p;
+
+	tbl = s_p_hashtbl_create(_gres_options);
+	s_p_parse_line(tbl, *leftover, leftover);
+
+	p = xmalloc(sizeof(gres_conf_t));
+	p->name = xstrdup(value);
+	if (!s_p_get_uint16(&p->count, "Count", tbl))
+		p->count = 1;
+	s_p_get_string(&p->cpus, "CPUs", tbl);
+	s_p_get_string(&p->file, "File", tbl);
+
+	s_p_hashtbl_destroy(tbl);
+	*dest = (void *)p;
+
+	return 1;
+}
+
 /*
  * Load this node's gres configuration (i.e. how many resources it has)
  */
 extern int gres_plugin_node_config_load(void)
 {
-	int i, rc;
+	static s_p_options_t _gres_options[] = {
+		{"Name", S_P_ARRAY, _parse_gres_config, NULL},
+		{NULL}
+	};
+
+	int count, i, rc;
+	struct stat config_stat;
+	s_p_hashtbl_t *tbl;
+	gres_conf_t **gres_array;
+	List gres_conf_list = NULL;
+	char *gres_conf_file = _get_gres_conf();
+
+	if (stat(gres_conf_file, &config_stat) < 0)
+		fatal("can't stat gres.conf file %s: %m", gres_conf_file);
+	tbl = s_p_hashtbl_create(_gres_options);
+	if (s_p_parse_file(tbl, NULL, gres_conf_file) == SLURM_ERROR)
+		fatal("error opening/reading %s", gres_conf_file);
+	gres_conf_list = list_create(_destroy_gres_conf);
+	if (gres_conf_list == NULL)
+		fatal("list_create: malloc failure");
+	if (s_p_get_array((void ***) &gres_array, &count, "Name", tbl)) {
+		for (i = 0; i < count; i++) {
+			list_append(gres_conf_list, gres_array[i]);
+			gres_array[i] = NULL;
+		}
+	}
+	s_p_hashtbl_destroy(tbl);
+	list_for_each(gres_conf_list, _log_gres_conf, NULL);

 	rc = gres_plugin_init();

 	slurm_mutex_lock(&gres_context_lock);
 	for (i=0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
-		rc = (*(gres_context[i].ops.node_config_load))();
+		rc = (*(gres_context[i].ops.node_config_load))(gres_conf_list);
 	}
 	slurm_mutex_unlock(&gres_context_lock);

+	list_destroy(gres_conf_list);
+	xfree(gres_conf_file);
 	return rc;
 }
diff --git a/src/common/gres.h b/src/common/gres.h
index 469b96f4837..171c49ebec8 100644
--- a/src/common/gres.h
+++ b/src/common/gres.h
@@ -42,6 +42,13 @@
 #include <slurm/slurm.h>
 #include "src/common/pack.h"

+typedef struct gres_conf {
+	uint16_t count;
+	char *cpus;
+	char *file;
+	char *name;
+} gres_conf_t;
+
 /*
  * Initialize the gres plugin.
  *
diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c
index 4337ff801f8..d9ec1125f12 100644
--- a/src/plugins/gres/gpu/gres_gpu.c
+++ b/src/plugins/gres/gpu/gres_gpu.c
@@ -73,6 +73,7 @@
 #include "src/common/slurm_xlator.h"
 #include "src/common/bitstring.h"
+#include "src/common/gres.h"
 #include "src/common/list.h"

 /*
@@ -205,103 +206,33 @@ extern int help_msg(char *msg, int msg_size)
 	return SLURM_SUCCESS;
 }

-#ifdef HAVE_HWLOC_PCI
-/* Get a count of GPUs in this part of the topology based upon name.
- * See gpu_names above. */
-static void _gpu_cnt(hwloc_topology_t topology, hwloc_obj_t obj, List gres_list)
+/* Load configuration based upon gres.conf */
+extern int node_config_load(List gres_conf_list)
 {
-	int i;
-
-	if (obj->complete_cpuset) {
-		obj->userdata = obj->complete_cpuset;
-	} else if (obj->parent) {
-		if (obj->parent->complete_cpuset)
-			obj->userdata = obj->parent->complete_cpuset;
-		else
-			obj->userdata = obj->parent->userdata;
-
-	}
-
-	if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
-		for (i=0; gpu_names[i]; i++) {
-			if (strcmp(obj->name, gpu_names[i]))
-				continue;
-			list_enqueue(gres_list, obj);
-			break;
-		}
-	}
-
-	for (i = 0; i < obj->arity; i++)
-		_gpu_cnt(topology, obj->children[i], gres_list);
-}
-
-#if 0
-/* Convert a hwloc CPUSET bitmap into a SLURM bitmap */
-static bitstr_t *_make_slurm_bitmap(hwloc_obj_t obj, int cpu_cnt)
-{
-	bitstr_t *gres_bitmask;
-	int i;
-
-	gres_bitmask = bit_alloc(cpu_cnt);
-	if (gres_bitmask == NULL)
-		fatal("bit_alloc malloc failure");
-	for (i=0; i<cpu_cnt; i++) {
-		if (hwloc_cpuset_isset((hwloc_const_cpuset_t) obj->userdata, i))
-			bit_set(gres_bitmask, i);
+	int rc = SLURM_ERROR;
+	ListIterator iter;
+	gres_conf_t *gres_conf;
+
+	xassert(gres_conf_list);
+	gres_config.loaded = false;
+	gres_config.gpu_cnt = 0;
+
+	iter = list_iterator_create(gres_conf_list);
+	if (iter == NULL)
+		fatal("list_iterator_create: malloc failure");
+	while ((gres_conf = list_next(iter))) {
+		if (strcmp(gres_conf->name, "gpu"))
+			continue;
+		gres_config.loaded = true;
+		gres_config.gpu_cnt += gres_conf->count;
+		rc = SLURM_SUCCESS;
 	}
-	return gres_bitmask;
-}
-#endif
-
-/*
- * Get the current configuration of this resource (e.g. how many exist,
- * their topology and any other required information).
- * Called only by slurmd.
- */
-extern int node_config_load(void)
-{
-	hwloc_topology_t topology = (hwloc_topology_t) NULL;
-	hwloc_obj_t root_obj;
-	int cpu_cnt, rc = SLURM_SUCCESS;
-	List gres_obj_list = NULL;
+	list_iterator_destroy(iter);

-	_purge_old_node_config();
-	if (hwloc_topology_init(&topology) != 0) {
-		rc = SLURM_ERROR;
-		goto fini;
-	}
-	hwloc_topology_ignore_type(topology, HWLOC_OBJ_BRIDGE);
-	hwloc_topology_ignore_type(topology, HWLOC_OBJ_CACHE);
-	hwloc_topology_ignore_type(topology, HWLOC_OBJ_GROUP);
-	hwloc_topology_ignore_type(topology, HWLOC_OBJ_MISC);
-	hwloc_topology_ignore_type(topology, HWLOC_OBJ_OS_DEVICE);
-	if (hwloc_topology_load(topology) != 0) {
-		rc = SLURM_ERROR;
-		goto fini;
-	}
-	gres_obj_list = list_create(NULL);
-	root_obj = hwloc_get_root_obj(topology);
-	cpu_cnt = hwloc_cpuset_last(root_obj->complete_cpuset) + 1;
-	_gpu_cnt(topology, root_obj, gres_obj_list);
-	gres_config.gpu_cnt = list_count(gres_obj_list);
-	/* Extract topology information later */
-	list_destroy(gres_obj_list);
-
-fini:	if (topology)
-		hwloc_topology_destroy(topology);
-	if (rc == SLURM_SUCCESS)
-		gres_config.loaded = true;
+	if (rc != SLURM_SUCCESS)
+		error("%s failed to load configuration", plugin_name);
 	return rc;
 }
-#else
-/* We lack a mechanism for getting the resource count on this node */
-extern int node_config_load(void)
-{
-	gres_config.loaded = true;
-	gres_config.gpu_cnt = 0;
-	return SLURM_SUCCESS;
-}
-#endif

 /*
  * Pack this node's current configuration.
@@ -317,7 +248,7 @@ extern int node_config_pack(Buf buffer)
 	pack32(plugin_version, buffer);

 	if (!gres_config.loaded)
-		rc = node_config_load();
+		fatal("%s failed to load configuration", plugin_name);

 	/* Pack whatever node information is relevant to the slurmctld,
 	 * including topology. */
diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c
index adeb729c3ce..0ab6e6961f9 100644
--- a/src/plugins/gres/nic/gres_nic.c
+++ b/src/plugins/gres/nic/gres_nic.c
@@ -73,6 +73,7 @@
 #include "src/common/slurm_xlator.h"
 #include "src/common/bitstring.h"
+#include "src/common/gres.h"
 #include "src/common/list.h"

 /*
@@ -205,103 +206,33 @@ extern int help_msg(char *msg, int msg_size)
 	return SLURM_SUCCESS;
 }

-#ifdef HAVE_HWLOC_PCI
-/* Get a count of NICs in this part of the topology based upon name.
- * See nic_names above. */
-static void _nic_cnt(hwloc_topology_t topology, hwloc_obj_t obj, List gres_list)
+/* Load configuration based upon gres.conf */
+extern int node_config_load(List gres_conf_list)
 {
-	int i;
-
-	if (obj->complete_cpuset) {
-		obj->userdata = obj->complete_cpuset;
-	} else if (obj->parent) {
-		if (obj->parent->complete_cpuset)
-			obj->userdata = obj->parent->complete_cpuset;
-		else
-			obj->userdata = obj->parent->userdata;
-
-	}
-
-	if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
-		for (i=0; nic_names[i]; i++) {
-			if (strcmp(obj->name, nic_names[i]))
-				continue;
-			list_enqueue(gres_list, obj);
-			break;
-		}
-	}
-
-	for (i = 0; i < obj->arity; i++)
-		_nic_cnt(topology, obj->children[i], gres_list);
-}
-
-#if 0
-/* Convert a hwloc CPUSET bitmap into a SLURM bitmap */
-static bitstr_t *_make_slurm_bitmap(hwloc_obj_t obj, int cpu_cnt)
-{
-	bitstr_t *gres_bitmask;
-	int i;
-
-	gres_bitmask = bit_alloc(cpu_cnt);
-	if (gres_bitmask == NULL)
-		fatal("bit_alloc malloc failure");
-	for (i=0; i<cpu_cnt; i++) {
-		if (hwloc_cpuset_isset((hwloc_const_cpuset_t) obj->userdata, i))
-			bit_set(gres_bitmask, i);
+	int rc = SLURM_ERROR;
+	ListIterator iter;
+	gres_conf_t *gres_conf;
+
+	xassert(gres_conf_list);
+	gres_config.loaded = false;
+	gres_config.nic_cnt = 0;
+
+	iter = list_iterator_create(gres_conf_list);
+	if (iter == NULL)
+		fatal("list_iterator_create: malloc failure");
+	while ((gres_conf = list_next(iter))) {
+		if (strcmp(gres_conf->name, "nic"))
+			continue;
+		gres_config.loaded = true;
+		gres_config.nic_cnt += gres_conf->count;
+		rc = SLURM_SUCCESS;
 	}
-	return gres_bitmask;
-}
-#endif
-
-/*
- * Get the current configuration of this resource (e.g. how many exist,
- * their topology and any other required information).
- * Called only by slurmd.
- */ -extern int node_config_load(void) -{ - hwloc_topology_t topology = (hwloc_topology_t) NULL; - hwloc_obj_t root_obj; - int cpu_cnt, rc = SLURM_SUCCESS; - List gres_obj_list = NULL; + list_iterator_destroy(iter); - _purge_old_node_config(); - if (hwloc_topology_init(&topology) != 0) { - rc = SLURM_ERROR; - goto fini; - } - hwloc_topology_ignore_type(topology, HWLOC_OBJ_BRIDGE); - hwloc_topology_ignore_type(topology, HWLOC_OBJ_CACHE); - hwloc_topology_ignore_type(topology, HWLOC_OBJ_GROUP); - hwloc_topology_ignore_type(topology, HWLOC_OBJ_MISC); - hwloc_topology_ignore_type(topology, HWLOC_OBJ_OS_DEVICE); - if (hwloc_topology_load(topology) != 0) { - rc = SLURM_ERROR; - goto fini; - } - gres_obj_list = list_create(NULL); - root_obj = hwloc_get_root_obj(topology); - cpu_cnt = hwloc_cpuset_last(root_obj->complete_cpuset) + 1; - _nic_cnt(topology, root_obj, gres_obj_list); - gres_config.nic_cnt = list_count(gres_obj_list); - /* Extract topology information later */ - list_destroy(gres_obj_list); - -fini: if (topology) - hwloc_topology_destroy(topology); - if (rc == SLURM_SUCCESS) - gres_config.loaded = true; + if (rc != SLURM_SUCCESS) + error("%s failed to load configuration", plugin_name); return rc; } -#else -/* We lack a mechanism for getting the resource count on this node */ -extern int node_config_load(void) -{ - gres_config.loaded = true; - gres_config.nic_cnt = 0; - return SLURM_SUCCESS; -} -#endif /* * Pack this node's current configuration. @@ -317,7 +248,7 @@ extern int node_config_pack(Buf buffer) pack32(plugin_version, buffer); if (!gres_config.loaded) - rc = node_config_load(); + fatal("%s failed to load configuration", plugin_name); /* Pack whatever node information is relevant to the slurmctld, * including topology. */ -- GitLab
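For reference, the parser added above reads gres.conf from the directory containing the file named by the SLURM_CONF environment variable when it is set, and otherwise from the compiled-in default $(sysconfdir)/gres.conf. Each record is keyed by Name, with optional Count, CPUs and File fields, and Count defaults to 1 when omitted. A minimal gres.conf consistent with that grammar might look like the sketch below; the device paths and CPU IDs are illustrative assumptions, and only the key names (Name, Count, CPUs, File) and the gres type names ("gpu", "nic") come from the code in this patch:

    # Hypothetical gres.conf (field values are examples only)
    Name=gpu File=/dev/nvidia0 CPUs=0-7
    Name=gpu File=/dev/nvidia1 CPUs=8-15
    Name=nic Count=2

With such a file, the gpu plugin's node_config_load() would sum the two gpu records into gres_config.gpu_cnt == 2, and the nic plugin would likewise record gres_config.nic_cnt == 2.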