diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml
index 1dd7e1d4352581658cac1745803acc9c9562b0bc..9b0a9746dd7074491621bbee0a7425ebfaab18e7 100644
--- a/doc/html/gres.shtml
+++ b/doc/html/gres.shtml
@@ -86,16 +86,15 @@ requested at job submit time using the <I>--gres</I> option supported by
 the <I>salloc</I>, <I>sbatch</I> and <I>srun</I> commands. The option
 requires an argument specifying which generic resources are required and
 how many resources. The resource specification is of the form
-<I>name[:count[</I>*cpu<I>]]</I>. The <I>name</I> is the same name as
+<I>name[:count]</I>. The <I>name</I> is the same name as
 specified by the <I>GresPlugins</I> and <I>Gres</I> configuration parameters.
 <I>count</I> specifies how many resources are required and has a default
-value of 1. <I>*cpu</I> indicates that the resource count specified is per
-allocated CPU, otherwise the count is per allocated node. For example:
-<I>sbatch --gres=gpu:2*cpu,nic:1 ...</I>.</P>
+value of 1. For example:<BR>
+<I>sbatch --gres=gpu:2 ...</I>.</P>
 
 <P>Jobs will be allocated specific generic resources as needed to satisfy
-the request. If the job is suspended, those resources typically become available
-for use by other jobs (plugin dependent).</P>
+the request. If the job is suspended, those resources do not become available
+for use by other jobs.</P>
 
 <P>Job steps can be allocated generic resources from those allocated to the
 job using the <I>--gres</I> option with the <I>srun</I> command as described
@@ -109,12 +108,41 @@ simple example is shown below.</P>
 #!/bin/bash
 #
 # gres_test.bash
-# Submit as follows: sbatch --gres=gpu:2 -n4 -N1-1 gres_test.bash
+# Submit as follows:
+# sbatch --gres=gpu:4 -n4 -N1-1 gres_test.bash
 #
-srun --gres=gpu:1 -n2 --exclusive job_1 &
-srun --gres=gpu:1 -n2 --exclusive job_2 &
+srun --gres=gpu:2 -n2 --exclusive show_device.sh &
+srun --gres=gpu:1 -n1 --exclusive show_device.sh &
+srun --gres=gpu:1 -n1 --exclusive show_device.sh &
 wait
 </PRE>
+
+<!-------------------------------------------------------------------------->
+<h2>GPU Management</h2>
+
+<P>In the case of SLURM's GRES plugin for GPUs, the environment variable
+CUDA_VISIBLE_DEVICES is set for each job step to determine which GPUs are
+available for its use. CUDA version 3.1 (or higher) uses this environment
+variable in order to run multiple jobs or job steps on a node with GPUs
+and ensure that the resources assigned to each are unique. In the example
+above, the allocated node may have four or more graphics devices. In that
+case, CUDA_VISIBLE_DEVICES will reference unique devices for each job step
+and the output might resemble this:</P>
+
+<PRE>
+JobStep=1234.0 CUDA_VISIBLE_DEVICES=0,1
+JobStep=1234.1 CUDA_VISIBLE_DEVICES=2
+JobStep=1234.2 CUDA_VISIBLE_DEVICES=3
+</PRE>
+
+<P>NOTE: Be sure to specify the <I>File</I> parameters in the <I>gres.conf</I>
+file and ensure they are in increasing numeric order.</P>
+<!-------------------------------------------------------------------------->
+<h2>Future</h2>
+
+<P>Our plans for the near future call for integrating SLURM GRES support with
+Linux <I>cgroups</I> in order to remove access to devices not allocated to
+a job or job step.</P>
 
 <!-------------------------------------------------------------------------->
 <p style="text-align: center;">Last modified 29 August 2010</p>
diff --git a/doc/man/man5/gres.conf.5 b/doc/man/man5/gres.conf.5
index bb0826bf8ce1439b293d2c71490f00ca9a302637..301fcb0241de419e7a2dd5bbf2745333dbbd6b8b 100644
--- a/doc/man/man5/gres.conf.5
+++ b/doc/man/man5/gres.conf.5
@@ -90,7 +90,7 @@ Name=gpu File=/dev/nvidia2 CPUs=2,3
 .br
 Name=gpu File=/dev/nvidia3 CPUs=2,3
 .br
-Name=bandwidth Count=20
+Name=bandwidth Count=20M
 
 .SH "COPYING"
 Copyright (C) 2010 The Regents of the University of California.
diff --git a/src/common/gres.c b/src/common/gres.c
index cca6b94cd5297ebddf9e8946fc66711ddfd9ab7f..903f425e57e200b15383860f075fe40534a147ce 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -87,6 +87,8 @@
 /* Gres symbols provided by the plugin */
 typedef struct slurm_gres_ops {
     int     (*node_config_load)    ( List gres_conf_list );
+    void    (*job_set_env)         ( char ***job_env_ptr,
+                                     void *gres_ptr );
     void    (*step_set_env)        ( char ***job_env_ptr,
                                      void *gres_ptr );
 } slurm_gres_ops_t;
@@ -225,6 +227,7 @@ static int _load_gres_plugin(char *plugin_name,
      */
     static const char *syms[] = {
         "node_config_load",
+        "job_set_env",
         "step_set_env",
     };
     int n_syms = sizeof(syms) / sizeof(char *);
@@ -2850,6 +2853,44 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
     return rc;
 }
 
+/*
+ * Set environment variables as required for a batch job
+ * IN/OUT job_env_ptr - environment variable array
+ * IN gres_list - generated by gres_plugin_job_alloc()
+ */
+extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list)
+{
+    int i;
+    ListIterator gres_iter;
+    gres_state_t *gres_ptr = NULL;
+
+    (void) gres_plugin_init();
+
+    slurm_mutex_lock(&gres_context_lock);
+    for (i=0; i<gres_context_cnt; i++) {
+        if (gres_context[i].ops.job_set_env == NULL)
+            continue;    /* No plugin to call */
+        if (job_gres_list) {
+            gres_iter = list_iterator_create(job_gres_list);
+            while ((gres_ptr = (gres_state_t *)
+                    list_next(gres_iter))) {
+                if (gres_ptr->plugin_id !=
+                    gres_context[i].plugin_id)
+                    continue;
+                (*(gres_context[i].ops.job_set_env))
+                    (job_env_ptr, gres_ptr->gres_data);
+                break;
+            }
+            list_iterator_destroy(gres_iter);
+        }
+        if (gres_ptr == NULL) { /* No data found */
+            (*(gres_context[i].ops.job_set_env))
+                (job_env_ptr, NULL);
+        }
+    }
+    slurm_mutex_unlock(&gres_context_lock);
+}
+
 static void _job_state_log(void *gres_data, uint32_t job_id, char *gres_name)
 {
     gres_job_state_t *gres_ptr;
@@ -3394,7 +3435,7 @@ extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list)
 {
     int i;
     ListIterator gres_iter;
-    gres_state_t *gres_ptr;
+    gres_state_t *gres_ptr = NULL;
 
     (void) gres_plugin_init();
 
@@ -3402,19 +3443,23 @@ extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list)
     for (i=0; i<gres_context_cnt; i++) {
         if (gres_context[i].ops.step_set_env == NULL)
             continue;    /* No plugin to call */
-        gres_iter = list_iterator_create(step_gres_list);
-        while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
-            if (gres_ptr->plugin_id != gres_context[i].plugin_id)
-                continue;
-            (*(gres_context[i].ops.step_set_env))
+        if (step_gres_list) {
+            gres_iter = list_iterator_create(step_gres_list);
+            while ((gres_ptr = (gres_state_t *)
+                    list_next(gres_iter))) {
+                if (gres_ptr->plugin_id !=
+                    gres_context[i].plugin_id)
+                    continue;
+                (*(gres_context[i].ops.step_set_env))
                     (job_env_ptr, gres_ptr->gres_data);
-            break;
+                break;
+            }
+            list_iterator_destroy(gres_iter);
         }
         if (gres_ptr == NULL) { /* No data found */
             (*(gres_context[i].ops.step_set_env))
                 (job_env_ptr, NULL);
         }
-        list_iterator_destroy(gres_iter);
     }
     slurm_mutex_unlock(&gres_context_lock);
 }
diff --git a/src/common/gres.h b/src/common/gres.h
index f16f538360a6b17e634532f70f76b8f2596964f4..ec9b2ac6412a12a4a975c16e45097d3088284466 100644
--- a/src/common/gres.h
+++ b/src/common/gres.h
@@ -370,6 +370,13 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
                                    int node_offset, uint32_t job_id,
                                    char *node_name);
 
+/*
+ * Set environment variables as required for a batch job
+ * IN/OUT job_env_ptr - environment variable array
+ * IN gres_list - generated by gres_plugin_job_alloc()
+ */
+extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list);
+
 /*
  * Log a job's current gres state
  * IN gres_list - generated by gres_plugin_job_state_validate()
@@ -433,7 +440,6 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
  */
 extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list);
 
-
 /*
  * Log a step's current gres state
  * IN gres_list - generated by gres_plugin_step_allocate()
diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c
index 7eb59bb181768e6dbd142a19f5b85c39984b04f1..56168ac99fc07b14db3d2a270f499c428b72f05c 100644
--- a/src/plugins/gres/gpu/gres_gpu.c
+++ b/src/plugins/gres/gpu/gres_gpu.c
@@ -136,6 +136,40 @@ extern int node_config_load(List gres_conf_list)
     return rc;
 }
 
+/*
+ * Set environment variables as appropriate for a job (i.e. all tasks) based
+ * upon the job's GRES state.
+ */
+extern void job_set_env(char ***job_env_ptr, void *gres_ptr)
+{
+    int i, len;
+    char *dev_list = NULL;
+    gres_job_state_t *gres_job_ptr = (gres_job_state_t *) gres_ptr;
+
+    if ((gres_job_ptr != NULL) &&
+        (gres_job_ptr->node_cnt == 1) &&
+        (gres_job_ptr->gres_bit_alloc != NULL) &&
+        (gres_job_ptr->gres_bit_alloc[0] != NULL)) {
+        len = bit_size(gres_job_ptr->gres_bit_alloc[0]);
+        for (i=0; i<len; i++) {
+            if (!bit_test(gres_job_ptr->gres_bit_alloc[0], i))
+                continue;
+            if (!dev_list)
+                dev_list = xmalloc(128);
+            else
+                xstrcat(dev_list, ",");
+            xstrfmtcat(dev_list, "%d", i);
+        }
+    }
+    if (dev_list) {
+        env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES",
+                            dev_list);
+        xfree(dev_list);
+    } else {
+        env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES", "");
+    }
+}
+
 /*
  * Set environment variables as appropriate for a job (i.e. all tasks) based
  * upon the job step's GRES state.
@@ -146,7 +180,8 @@ extern void step_set_env(char ***job_env_ptr, void *gres_ptr)
     char *dev_list = NULL;
     gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr;
 
-    if ((gres_step_ptr->node_cnt == 1) &&
+    if ((gres_step_ptr != NULL) &&
+        (gres_step_ptr->node_cnt == 1) &&
         (gres_step_ptr->gres_bit_alloc != NULL) &&
         (gres_step_ptr->gres_bit_alloc[0] != NULL)) {
         len = bit_size(gres_step_ptr->gres_bit_alloc[0]);
diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c
index 89b1dd9cede98d7610889e062abb67026b052754..da120712d6e504ee00837987d3adad2c51ec9ffe 100644
--- a/src/plugins/gres/nic/gres_nic.c
+++ b/src/plugins/gres/nic/gres_nic.c
@@ -134,6 +134,15 @@ extern int node_config_load(List gres_conf_list)
     return rc;
 }
 
+/*
+ * Set environment variables as appropriate for a job (i.e. all tasks) based
+ * upon the job's GRES state.
+ */
+extern void job_set_env(char ***job_env_ptr, void *gres_ptr)
+{
+    /* EMPTY */
+}
+
 /*
  * Set environment variables as appropriate for a job (i.e. all tasks) based
  * upon the job step's GRES state.
diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c
index ca8358ce12ab2e2897ff148fa1d7509113b46ba9..5ef4041b9e281b5d8f53c0c8d85d5cb612bd2fa0 100644
--- a/src/slurmd/slurmstepd/slurmstepd.c
+++ b/src/slurmd/slurmstepd/slurmstepd.c
@@ -381,12 +381,16 @@ _step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
     job->jmgr_pid = getpid();
     job->jobacct = jobacct_gather_g_create(NULL);
 
+    /* Establish GRES environment variables */
     if (conf->debug_flags & DEBUG_FLAG_GRES) {
         gres_plugin_job_state_log(job->job_gres_list, job->jobid);
         gres_plugin_step_state_log(job->step_gres_list, job->jobid,
                                    job->stepid);
     }
-    gres_plugin_step_set_env(&job->env, job->step_gres_list);
+    if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH)
+        gres_plugin_job_set_env(&job->env, job->job_gres_list);
+    else if (msg->msg_type == REQUEST_LAUNCH_TASKS)
+        gres_plugin_step_set_env(&job->env, job->step_gres_list);
 
     /*
      * Add slurmd node topology informations to job env array
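
Note on the gres.shtml example above: the show_device.sh helper that the new batch
script launches is not included in this patch, so the sketch below is illustrative
only. It assumes the standard SLURM_JOB_ID and SLURM_STEPID variables are present
in each step's environment alongside the CUDA_VISIBLE_DEVICES value exported by the
gres/gpu plugin, which would produce output in the "JobStep=... CUDA_VISIBLE_DEVICES=..."
form shown in the documentation.

#!/bin/bash
# show_device.sh (hypothetical helper, not part of this patch):
# report which GPU devices SLURM made visible to this job step.
echo "JobStep=${SLURM_JOB_ID}.${SLURM_STEPID} CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"
# Hold the step briefly so the concurrent --exclusive steps overlap
# and each one keeps its distinct GPU assignment while running.
sleep 3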