From aa0b388e2a57314ea5ac032863481cedbf3be7eb Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Thu, 10 Apr 2014 10:18:41 -0700 Subject: [PATCH] Parse gres:type spec in slurm.conf --- doc/html/gres.shtml | 21 ++++++++++++++------- doc/man/man5/slurm.conf.5 | 11 ++++++----- src/common/gres.c | 31 +++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml index 8436b9fdd0b..ef3a6f8ead1 100644 --- a/doc/html/gres.shtml +++ b/doc/html/gres.shtml @@ -69,15 +69,20 @@ the option must be specified on all nodes and SLURM will track the assignment of each specific resource on each node. Otherwise SLURM will only track a count of allocated resources rather than the state of each individual device file.</LI> + +<LI><B>Type</B> Optionally specify the device type. For example, this might +be used to identify a specific model of GPU, which users can then specify +in their job request. +NOTE: This is a new capability added in Slurm version 14.11.</LI> </UL> <P>Sample gres.conf file:</P> <PRE> # Configure support for our four GPUs -Name=gpu File=/dev/nvidia0 CPUs=0,1 -Name=gpu File=/dev/nvidia1 CPUs=0,1 -Name=gpu File=/dev/nvidia2 CPUs=2,3 -Name=gpu File=/dev/nvidia3 CPUs=2,3 +Name=gpu Type=tesla File=/dev/nvidia0 CPUs=0,1 +Name=gpu Type=tesla File=/dev/nvidia1 CPUs=0,1 +Name=gpu Type=kepler File=/dev/nvidia2 CPUs=2,3 +Name=gpu Type=kepler File=/dev/nvidia3 CPUs=2,3 Name=bandwidth Count=20M </PRE> <!--------------------------------------------------------------------------> @@ -88,11 +93,13 @@ requested at job submit time using the <I>--gres</I> option supported by the <I>salloc</I>, <I>sbatch</I> and <I>srun</I> commands. The option requires an argument specifying which generic resources are required and how many resources. The resource specification is of the form -<I>name[:count]</I>. The <I>name</I> is the same name as +<I>name[:type:count]</I>. The <I>name</I> is the same name as specified by the <I>GresTypes</I> and <I>Gres</I> configuration parameters. +<I>type</I> identifies a specific type of that generic resource (e.g. a +specific model of GPU). <I>count</I> specifies how many resources are required and has a default value of 1. For example:<BR> -<I>sbatch --gres=gpu:2 ...</I>.</P> +<I>sbatch --gres=gpu:kepler:2 ...</I>.</P> <P>Jobs will be allocated specific generic resources as needed to satisfy the request. If the job is suspended, those resources do not become available @@ -171,6 +178,6 @@ to a physical device</pre> explicitly defined in the offload pragmas.</P> <!--------------------------------------------------------------------------> -<p style="text-align: center;">Last modified 25 October 2012</p> +<p style="text-align: center;">Last modified 10 April 2014</p> </body></html> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index a80f8b7e9ec..2231816f19d 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "October 2013" "slurm.conf 14.03" "Slurm configuration file" +.TH "slurm.conf" "5" "April 2014" "slurm.conf 14.11" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file @@ -2973,13 +2973,14 @@ Also see \fBGres\fR. .TP \fBGres\fR A comma delimited list of generic resources specifications for a node. -Each resource specification consists of a name followed by an optional -colon with a numeric value (default value is one) -(e.g. "Gres=bandwidth:10000,gpu:2"). +Each resource specification consists of a name followed by a colon with a +numeric value (e.g. "Gres=bandwidth:10000,gpu:2"). A suffix of "K", "M" or "G" may be used to multiply the number by 1024, -1048576 or 1073741824 respectively (e.g. "Gres=bandwidth:4G,gpu:4").. +1048576 or 1073741824 respectively (e.g. "Gres=bandwidth:4G,gpu:4"). By default a node has no generic resources and its maximum count is 4,294,967,295. +The generic resource type can optionally be specified using a third colon +separated field after the name (e.g. "Gres=gpu:tesla:1,gpu:kepler:1"). Also see \fBFeature\fR. .TP diff --git a/src/common/gres.c b/src/common/gres.c index e6cf1146f22..56a73729b13 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -1215,11 +1215,21 @@ static void _gres_node_list_delete(void *list_element) xfree(gres_ptr); } +/* + * Compute the total GRES count for a particular gres_name. + * Note that a given gres_name can appear multiple times in the orig_config + * string for multiple types (e.g. "gres=gpu:kepler:1,gpu:tesla:2"). + * IN orig_config - gres configuration from slurm.conf + * IN gres_name - name of the gres type (e.g. "gpu") + * IN gres_name_colon - gres name with appended colon + * IN gres_name_colon_len - size of gres_name_colon + * RET - Total configured count for this GRES type + */ static uint32_t _get_gres_cnt(char *orig_config, char *gres_name, char *gres_name_colon, int gres_name_colon_len) { - char *node_gres_config, *tok, *last_num = NULL, *last_tok = NULL; - uint32_t gres_config_cnt = 0; + char *node_gres_config, *tok, *num, *last_num = NULL, *last_tok = NULL; + uint32_t gres_config_cnt = 0, tmp_gres_cnt = 0; if (orig_config == NULL) return gres_config_cnt; @@ -1232,17 +1242,22 @@ static uint32_t _get_gres_cnt(char *orig_config, char *gres_name, break; } if (!strncmp(tok, gres_name_colon, gres_name_colon_len)) { - tok += gres_name_colon_len; - gres_config_cnt = strtol(tok, &last_num, 10); + num = strrchr(tok, ':'); + if (!num) { + error("Bad GRES configuration: %s", tok); + break; + } + num++; + tmp_gres_cnt = strtol(num, &last_num, 10); if (last_num[0] == '\0') ; else if ((last_num[0] == 'k') || (last_num[0] == 'K')) - gres_config_cnt *= 1024; + tmp_gres_cnt *= 1024; else if ((last_num[0] == 'm') || (last_num[0] == 'M')) - gres_config_cnt *= (1024 * 1024); + tmp_gres_cnt *= (1024 * 1024); else if ((last_num[0] == 'g') || (last_num[0] == 'G')) - gres_config_cnt *= (1024 * 1024 * 1024); - break; + tmp_gres_cnt *= (1024 * 1024 * 1024); + gres_config_cnt += tmp_gres_cnt; } tok = strtok_r(NULL, ",", &last_tok); } -- GitLab