diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml
index f0dcfacba0fa1f57c3b65a8331a128d0c2661a26..0c6f2632a202cd35f0282472bfea37c22824bf86 100644
--- a/doc/html/gres.shtml
+++ b/doc/html/gres.shtml
@@ -127,7 +127,11 @@
 NOTE: If you specify the <B>File</B> parameter for a resource on some node,
 the option must be specified on all nodes and Slurm will track the assignment
 of each specific resource on each node. Otherwise Slurm will only track a
 count of allocated resources rather than the state of each individual device
-file.</LI>
+file.
+<br>
+NOTE: Drain a node before changing the count of records with <B>File</B>
+parameters (e.g. if you want to add or remove GPUs from a node's configuration).
+Failure to do so will result in any job using those GRES being aborted.</LI>
 <LI><B>Type</B> Optionally specify the device type. For example, this might
 be used to identify a specific model of GPU, which users can then specify
@@ -347,6 +351,6 @@ to a physical device</pre>
 explicitly defined in the offload pragmas.</P>
 
 <!--------------------------------------------------------------------------->
-<p style="text-align: center;">Last modified 21 December 2018</p>
+<p style="text-align: center;">Last modified 31 December 2018</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/gres.conf.5 b/doc/man/man5/gres.conf.5
index c757d5a0107d4437195b7b985f8b8b15d322b6c3..504113e78a40ed2506b0400cbbcd60ce3e04805a 100644
--- a/doc/man/man5/gres.conf.5
+++ b/doc/man/man5/gres.conf.5
@@ -134,6 +134,10 @@
 of each specific resource on each node. Otherwise Slurm will only track a
 count of allocated resources rather than the state of each individual device
 file.
+NOTE: Drain a node before changing the count of records with \fBFile\fR
+parameters (e.g. if you want to add or remove GPUs from a node's configuration).
+Failure to do so will result in any job using those GRES being aborted.
+
 .TP
 \fBLinks\fR
 A comma\-delimited list of numbers identifying the number of connections
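The NOTE added to both documents describes an operational procedure. A hypothetical sequence for removing a GPU from a node (the node name, device paths and counts are illustrative assumptions, not part of this patch) might look like:

    # Drain the node and wait for running jobs to finish
    scontrol update NodeName=tux01 State=DRAIN Reason="changing GRES count"

    # On tux01: edit gres.conf, e.g. change
    #   Name=gpu File=/dev/nvidia[0-3]
    # to
    #   Name=gpu File=/dev/nvidia[0-2]
    # adjust the node's Gres= count in slurm.conf, and restart the daemons.

    # Return the node to service
    scontrol update NodeName=tux01 State=RESUME

Skipping the drain step is exactly what the code changes below guard against: slurmctld will now abort any running job whose allocated GRES no longer exist.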
diff --git a/src/common/gres.c b/src/common/gres.c
index 2c3ed88cb6966f80b011d613b9210ed402f3a71f..f8dab46153f650f0cabe6a7c5bc93d56345b49a7 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -278,6 +278,8 @@
 static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
                            bool ignore_alloc, char *gres_name,
                            uint32_t job_id, uint32_t step_id, uint32_t plugin_id);
+static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
+                                  gres_state_t *gpu_gres_ptr);
 static int _unload_gres_plugin(slurm_gres_context_t *plugin_context);
 static void _validate_config(slurm_gres_context_t *context_ptr);
 static int _validate_file(char *path_name, char *gres_name);
@@ -1846,7 +1848,7 @@ static int _node_config_init(char *node_name, char *orig_config,
 extern int gres_plugin_init_node_config(char *node_name, char *orig_config,
                                         List *gres_list)
 {
-	int i, rc;
+	int i, rc, rc2;
 	ListIterator gres_iter;
 	gres_state_t *gres_ptr;
 
@@ -1856,7 +1858,7 @@ extern int gres_plugin_init_node_config(char *node_name, char *orig_config,
 	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
 		*gres_list = list_create(_gres_node_list_delete);
 	}
-	for (i = 0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
+	for (i = 0; i < gres_context_cnt; i++) {
 		/* Find or create gres_state entry on the list */
 		gres_iter = list_iterator_create(*gres_list);
 		while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
@@ -1870,8 +1872,10 @@
 			list_append(*gres_list, gres_ptr);
 		}
 
-		rc = _node_config_init(node_name, orig_config,
-		                       &gres_context[i], gres_ptr);
+		rc2 = _node_config_init(node_name, orig_config,
+		                        &gres_context[i], gres_ptr);
+		if (rc == SLURM_SUCCESS)
+			rc = rc2;
 	}
 	slurm_mutex_unlock(&gres_context_lock);
 
@@ -2022,10 +2026,20 @@
 		return rc;
 
 	gres_cnt = _get_tot_gres_cnt(context_ptr->plugin_id, &set_cnt);
+	if (gres_data->gres_cnt_config > gres_cnt) {
+		if (reason_down && (*reason_down == NULL)) {
+			xstrfmtcat(*reason_down,
+				   "%s count reported lower than configured "
+				   "(%"PRIu64" < %"PRIu64")",
+				   context_ptr->gres_type,
+				   gres_cnt, gres_data->gres_cnt_config);
+		}
+		rc = EINVAL;
+	}
 	if (gres_data->gres_cnt_found != gres_cnt) {
 		if (gres_data->gres_cnt_found != NO_VAL64) {
-			info("%s: count changed for node %s from %"PRIu64" to %"PRIu64,
-			     context_ptr->gres_type, node_name,
+			info("%s: %s: \"Count\" changed on node %s (%"PRIu64" != %"PRIu64")",
+			     __func__, context_ptr->gres_type, node_name,
 			     gres_data->gres_cnt_found, gres_cnt);
 		}
 		if ((gres_data->gres_cnt_found != NO_VAL64) &&
@@ -2045,7 +2059,13 @@
 	}
 	if (updated_config == false)
 		return rc;
-
+	if ((gres_data->gres_cnt_config != 0) &&
+	    (set_cnt > gres_data->gres_cnt_config)) {
+		info("%s: %s: \"Count\" on node %s inconsistent with slurmctld count (%"PRIu64" != %"PRIu64")",
+		     __func__, context_ptr->gres_type, node_name,
+		     set_cnt, gres_data->gres_cnt_config);
+		set_cnt = gres_data->gres_cnt_config; /* Ignore excess GRES */
+	}
 	if ((set_cnt == 0) && (set_cnt != gres_data->topo_cnt)) {
 		/* Need to clear topology info */
 		xfree(gres_data->topo_gres_cnt_alloc);
@@ -2160,8 +2180,8 @@
 				bit_or(tot_core_bitmap,
 				       gres_data->topo_core_bitmap[i]);
 			} else if (i == 0) {
-				error("%s: invalid GRES cpu count (%u) on node %s",
-				      context_ptr->gres_type,
+				error("%s: %s: invalid GRES cpu count (%u) on node %s",
+				      __func__, context_ptr->gres_type,
 				      gres_slurmd_conf->cpu_cnt, node_name);
 			}
@@ -2169,8 +2189,9 @@
 			cpus_config = core_cnt;
 		} else if (cpus_config && !cpu_config_err) {
 			cpu_config_err = true;
-			error("%s: has CPUs configured for only some of the records on node %s",
-			      context_ptr->gres_type, node_name);
+			error("%s: %s: has CPUs configured for only some of the records on node %s",
+			      __func__, context_ptr->gres_type,
+			      node_name);
 		}
 
 		if (gres_slurmd_conf->links) {
@@ -2380,7 +2401,7 @@ extern int gres_plugin_node_config_validate(char *node_name,
 {
 	int i, rc, rc2;
 	ListIterator gres_iter;
-	gres_state_t *gres_ptr;
+	gres_state_t *gres_ptr, *gres_gpu_ptr = NULL, *gres_mps_ptr = NULL;
 	int core_cnt = sock_cnt * cores_per_sock;
 	int cpu_cnt = core_cnt * threads_per_core;
 
@@ -2389,7 +2410,7 @@
 	slurm_mutex_lock(&gres_context_lock);
 	if ((gres_context_cnt > 0) && (*gres_list == NULL))
 		*gres_list = list_create(_gres_node_list_delete);
-	for (i = 0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) {
+	for (i = 0; i < gres_context_cnt; i++) {
 		/* Find or create gres_state entry on the list */
 		gres_iter = list_iterator_create(*gres_list);
 		while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
@@ -2407,7 +2428,12 @@
 					     sock_cnt, fast_schedule,
 					     reason_down, &gres_context[i]);
 		rc = MAX(rc, rc2);
+		if (gres_ptr->plugin_id == gpu_plugin_id)
+			gres_gpu_ptr = gres_ptr;
+		else if (gres_ptr->plugin_id == mps_plugin_id)
+			gres_mps_ptr = gres_ptr;
 	}
+	_sync_node_mps_to_gpu(gres_mps_ptr, gres_gpu_ptr);
 	slurm_mutex_unlock(&gres_context_lock);
 
 	return rc;
@@ -2624,6 +2650,7 @@ static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
 
 	if (!gpu_gres_ptr || !mps_gres_ptr)
 		return;
+	gpu_gres_data = gpu_gres_ptr->gres_data;
 	mps_gres_data = mps_gres_ptr->gres_data;
 	gpu_cnt = gpu_gres_data->gres_cnt_avail;
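The core of the gres.c changes is the new under-count test in _node_config_validate(): a node that reports fewer GRES than slurm.conf configures now sets the node down with a reason, while excess detected GRES are merely clamped to the configured count. A minimal self-contained sketch of that decision, using plain structs rather than Slurm's internal types (node name and field names are invented for illustration):

    #include <errno.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins for Slurm's per-node GRES bookkeeping. */
    typedef struct {
        uint64_t cnt_config;  /* count from the slurm.conf node definition */
        uint64_t cnt_found;   /* count the node actually detected */
    } gres_counts_t;

    /*
     * Under-reporting is a hard error (the node would be marked down);
     * over-reporting is clamped and otherwise ignored.
     */
    static int validate_gres_count(const char *node, gres_counts_t *gc)
    {
        if (gc->cnt_config > gc->cnt_found) {
            fprintf(stderr,
                    "gpu count reported lower than configured "
                    "(%"PRIu64" < %"PRIu64") on node %s\n",
                    gc->cnt_found, gc->cnt_config, node);
            return EINVAL;
        }
        if (gc->cnt_found > gc->cnt_config)
            gc->cnt_found = gc->cnt_config;  /* ignore excess GRES */
        return 0;
    }

    int main(void)
    {
        gres_counts_t gc = { .cnt_config = 4, .cnt_found = 2 };
        return validate_gres_count("tux01", &gc) ? 1 : 0;
    }

In the real code the EINVAL travels back through rc = MAX(rc, rc2), and the reworked loops above keep visiting every GRES context instead of stopping at the first failure, so all problems are reported before the node is finally marked down.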
%s", - context_ptr->gres_type, + error("%s: %s: invalid GRES cpu count (%u) on node %s", + __func__, context_ptr->gres_type, gres_slurmd_conf->cpu_cnt, node_name); } @@ -2169,8 +2189,9 @@ static int _node_config_validate(char *node_name, char *orig_config, cpus_config = core_cnt; } else if (cpus_config && !cpu_config_err) { cpu_config_err = true; - error("%s: has CPUs configured for only some of the records on node %s", - context_ptr->gres_type, node_name); + error("%s: %s: has CPUs configured for only some of the records on node %s", + __func__, context_ptr->gres_type, + node_name); } if (gres_slurmd_conf->links) { @@ -2380,7 +2401,7 @@ extern int gres_plugin_node_config_validate(char *node_name, { int i, rc, rc2; ListIterator gres_iter; - gres_state_t *gres_ptr; + gres_state_t *gres_ptr, *gres_gpu_ptr = NULL, *gres_mps_ptr = NULL; int core_cnt = sock_cnt * cores_per_sock; int cpu_cnt = core_cnt * threads_per_core; @@ -2389,7 +2410,7 @@ extern int gres_plugin_node_config_validate(char *node_name, slurm_mutex_lock(&gres_context_lock); if ((gres_context_cnt > 0) && (*gres_list == NULL)) *gres_list = list_create(_gres_node_list_delete); - for (i = 0; ((i < gres_context_cnt) && (rc == SLURM_SUCCESS)); i++) { + for (i = 0; i < gres_context_cnt; i++) { /* Find or create gres_state entry on the list */ gres_iter = list_iterator_create(*gres_list); while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { @@ -2407,7 +2428,12 @@ extern int gres_plugin_node_config_validate(char *node_name, sock_cnt, fast_schedule, reason_down, &gres_context[i]); rc = MAX(rc, rc2); + if (gres_ptr->plugin_id == gpu_plugin_id) + gres_gpu_ptr = gres_ptr; + else if (gres_ptr->plugin_id == mps_plugin_id) + gres_mps_ptr = gres_ptr; } + _sync_node_mps_to_gpu(gres_mps_ptr, gres_gpu_ptr); slurm_mutex_unlock(&gres_context_lock); return rc; @@ -2624,6 +2650,7 @@ static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr, if (!gpu_gres_ptr || !mps_gres_ptr) return; + gpu_gres_data = gpu_gres_ptr->gres_data; mps_gres_data = mps_gres_ptr->gres_data; gpu_cnt = gpu_gres_data->gres_cnt_avail; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 7466f21ff1d38293d2d5e14f2c62abe9eda50eb1..3a174f3b439d23db818e2d194aaccdd1b7360be9 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -10591,6 +10591,20 @@ void reset_job_bitmaps(void) job_ptr); job_fail = true; } + if (!job_fail && + (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) && + gres_plugin_job_revalidate2(job_ptr->job_id, + job_ptr->gres_list, + job_ptr->job_resrcs->node_bitmap)) { + /* + * This can be due to the job being allocated GRES + * which no longer exist (i.e. the GRES count on some + * allocated node changed since when the job started). + */ + error("Aborting %pJ due to use of invalid GRES configuration", + job_ptr); + job_fail = true; + } _reset_step_bitmaps(job_ptr); /* Do not increase the job->node_cnt for completed jobs */ diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 751a9c5350becbb3bfd746990110ce481d17bc5a..aba3b014991b5bc772eb0b0f664d9c6958e9148b 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -1712,14 +1712,12 @@ static void _gres_reconfig(bool reconfig) gres_name = node_ptr->config_ptr->gres; gres_plugin_init_node_config(node_ptr->name, gres_name, &node_ptr->gres_list); - if (!IS_NODE_CLOUD(node_ptr)) - continue; /* - * Load in gres for node now. 
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 751a9c5350becbb3bfd746990110ce481d17bc5a..aba3b014991b5bc772eb0b0f664d9c6958e9148b 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -1712,14 +1712,12 @@ static void _gres_reconfig(bool reconfig)
 		gres_name = node_ptr->config_ptr->gres;
 		gres_plugin_init_node_config(node_ptr->name, gres_name,
 					     &node_ptr->gres_list);
-		if (!IS_NODE_CLOUD(node_ptr))
-			continue;
 		/*
-		 * Load in gres for node now. By default Slurm gets this
-		 * information when the node registers for the first
-		 * time, which can take a while for a node in the cloud
-		 * to boot.
+		 * Load in GRES for the node now, based upon the saved
+		 * state and configuration. Current information is
+		 * gathered when the node registers, which can take a
+		 * while for a node in the cloud to boot.
 		 */
 		gres_plugin_node_config_load(
 			node_ptr->config_ptr->cpus, node_ptr->name,