diff --git a/NEWS b/NEWS index c53d033ace9a2dc2dd9b10c148d07a15bf162421..23910dac6d544c4e70a7e4b634bfaee00e0ee7b4 100644 --- a/NEWS +++ b/NEWS @@ -60,6 +60,9 @@ documents those changes that are of interest to users and administrators. -- Fix issue with association hash not getting the correct index which could result in seg fault. -- Fix salloc/sbatch -B segfault. + -- Avoid huge malloc if GRES is configured with "Type" and a huge "Count". + -- Prevent jobs in overlapping reservations from starting if they won't + finish before a "maint" reservation begins. * Changes in Slurm 14.11.1 ========================== @@ -417,6 +420,8 @@ documents those changes that are of interest to users and administrators. -- sview - Fix displaying of suspended steps elapsed times. -- Increase number of messages that get cached before throwing them away when the DBD is down. + -- Prevent jobs in overlapping reservations from starting if they won't + finish before a "maint" reservation begins. * Changes in Slurm 14.03.10 =========================== diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml index 43d73c026e6694160f4001df3692265fbb913d1d..d52b5cb0bbbe5c8b90ff00c555c3477182e594bb 100644 --- a/doc/html/gres.shtml +++ b/doc/html/gres.shtml @@ -101,6 +101,7 @@ file.</LI> <LI><B>Type</B> Optionally specify the device type. For example, this might be used to identify a specific model of GPU, which users can then specify in their job request. +If <B>Type</B> is specified, then <B>Count</B> is limited in size (currently 1024). 
NOTE: This is a new capability added in Slurm version 14.11.</LI> </UL> @@ -206,6 +207,6 @@ to a physical device</pre> explicitly defined in the offload pragmas.</P> <!--------------------------------------------------------------------------> -<p style="text-align: center;">Last modified 10 April 2014</p> +<p style="text-align: center;">Last modified 4 December 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/gres.conf.5 b/doc/man/man5/gres.conf.5 index d928f8543daaf87bf7a4859a2d5f0de6f5cb5fb1..f8b2a0002f86201bc0c0518fdb0d2b0bb039ffb7 100644 --- a/doc/man/man5/gres.conf.5 +++ b/doc/man/man5/gres.conf.5 @@ -104,6 +104,7 @@ the example below. \fBType\fR An arbitrary string identifying the type of device. For example, a particular model of GPU. +If \fBType\fR is specified, then \fBCount\fR is limited in size (currently 1024). .SH "EXAMPLES" .LP diff --git a/src/common/gres.c b/src/common/gres.c index c28918bbbe003558218a1d1fddee051bf6197db1..b0ca7c34cc3131f53771c2fcfb0c767920658bb2 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -86,6 +86,7 @@ #include "src/common/xstring.h" #define GRES_MAGIC 0x438a34d4 +#define MAX_GRES_BITMAP 1024 /* Gres symbols provided by the plugin */ typedef struct slurm_gres_ops { @@ -1098,15 +1099,15 @@ extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name) tmp_name, node_name); has_file = 1; } - if (has_file && (count > 1024)) { + if (has_file && (count > MAX_GRES_BITMAP)) { /* Avoid over-subscribing memory with * huge bitmaps */ - error("gres_plugin_node_config_unpack: " - "gres/%s has File plus very " + error("%s: gres/%s has File plus very " "large Count (%u) for node %s, " - "resetting value to 1024", - tmp_name, count, node_name); - count = 1024; + "resetting value to %d", + __func__, tmp_name, count, + node_name, MAX_GRES_BITMAP); + count = MAX_GRES_BITMAP; } if (has_file) /* Don't clear if already set */ gres_context[j].has_file = has_file; @@ -1702,7 +1703,7 @@ extern int 
_node_config_validate(char *node_name, char *orig_config, } else if (cpus_config) { error("%s: has CPUs configured for only" " some of the records on node %s", - context_ptr->gres_type,node_name); + context_ptr->gres_type, node_name); } gres_data->topo_gres_bitmap[i] = bit_alloc(gres_cnt); for (j = 0; j < gres_slurmd_conf->count; j++) { @@ -1734,6 +1735,14 @@ extern int _node_config_validate(char *node_name, char *orig_config, gres_data->gres_cnt_avail = 0; if (context_ptr->has_file) { + if (gres_data->gres_cnt_avail > MAX_GRES_BITMAP) { + error("%s: gres/%s has File plus very large Count (%u) " + "for node %s, resetting value to %u", + __func__, context_ptr->gres_type, + gres_data->gres_cnt_avail, node_name, + MAX_GRES_BITMAP); + gres_data->gres_cnt_avail = MAX_GRES_BITMAP; + } if (gres_data->gres_bit_alloc == NULL) { gres_data->gres_bit_alloc = bit_alloc(gres_data->gres_cnt_avail); diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 54f821c3e09cf20a2832e60df6040233d6bbcba2..87957f1b0176c1fb4c1b8d07b347c3e7bba5a072 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -4125,7 +4125,8 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, iter = list_iterator_create(resv_list); while ((res2_ptr = (slurmctld_resv_t *) list_next(iter))) { if ((resv_ptr->flags & RESERVE_FLAG_MAINT) || - (resv_ptr->flags & RESERVE_FLAG_OVERLAP) || + ((resv_ptr->flags & RESERVE_FLAG_OVERLAP) && + !(res2_ptr->flags & RESERVE_FLAG_MAINT)) || (res2_ptr == resv_ptr) || (res2_ptr->node_bitmap == NULL) || (res2_ptr->start_time >= job_end_time) ||