From 5ecf0584aa93ad611b2b5093f02f19e17b23c3f0 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 27 Aug 2010 16:55:02 +0000
Subject: [PATCH] Add support for batch jobs to set gpu gres env var Update
 docs and fix minor bug in step gpu gres env var

---
 doc/html/gres.shtml                | 46 +++++++++++++++++-----
 doc/man/man5/gres.conf.5           |  2 +-
 src/common/gres.c                  | 61 ++++++++++++++++++++++++++----
 src/common/gres.h                  |  8 +++-
 src/plugins/gres/gpu/gres_gpu.c    | 37 +++++++++++++++++-
 src/plugins/gres/nic/gres_nic.c    |  9 +++++
 src/slurmd/slurmstepd/slurmstepd.c |  6 ++-
 7 files changed, 148 insertions(+), 21 deletions(-)

diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml
index 1dd7e1d4352..9b0a9746dd7 100644
--- a/doc/html/gres.shtml
+++ b/doc/html/gres.shtml
@@ -86,16 +86,15 @@ requested at job submit time using the <I>--gres</I> option supported by
 the <I>salloc</I>, <I>sbatch</I> and <I>srun</I> commands. The option
 requires an argument specifying which generic resources are required and
 how many resources. The resource specification is of the form
-<I>name[:count[</I>*cpu<I>]]</I>. The <I>name</I> is the same name as
+<I>name[:count]</I>. The <I>name</I> is the same name as
 specified by the <I>GresPlugins</I> and <I>Gres</I> configuration parameters.
 <I>count</I> specifies how many resources are required and has a default
-value of 1. <I>*cpu</I> indicates that the resource count specified is per
-allocated CPU, otherwise the count is per allocated node. For example:
-<I>sbatch --gres=gpu:2*cpu,nic:1 ...</I>.</P>
+value of 1. For example:<BR> 
+<I>sbatch --gres=gpu:2 ...</I>.</P>
 
 <P>Jobs will be allocated specific generic resources as needed to satisfy
-the request. If the job is suspended, those resources typically become available
-for use by other jobs (plugin dependent).</P>
+the request. If the job is suspended, those resources do not become available
+for use by other jobs.</P>
 
 <P>Job steps can be allocated generic resources from those allocated to the
 job using the <I>--gres</I> option with the <I>srun</I> command as described
@@ -109,12 +108,41 @@ simple example is shown below.</P>
 #!/bin/bash
 #
 # gres_test.bash
-# Submit as follows: sbatch --gres=gpu:2 -n4 -N1-1 gres_test.bash
+# Submit as follows:
+# sbatch --gres=gpu:4 -n4 -N1-1 gres_test.bash
 #
-srun --gres=gpu:1 -n2 --exclusive job_1 &
-srun --gres=gpu:1 -n2 --exclusive job_2 &
+srun --gres=gpu:2 -n2 --exclusive show_device.sh &
+srun --gres=gpu:1 -n1 --exclusive show_device.sh &
+srun --gres=gpu:1 -n1 --exclusive show_device.sh &
 wait
 </PRE>
+
+<!-------------------------------------------------------------------------->
+<h2>GPU Management</h2>
+
+<P>In the case of SLURM's GRES plugin for GPUs, the environment variable
+CUDA_VISIBLE_DEVICES is set for each job step to determine which GPUs are
+available for its use. CUDA version 3.1 (or higher) uses this environment
+variable in order to run multiple jobs or job steps on a node with GPUs
+and ensure that the resources assigned to each are unique. In the example
+above, the allocated node may have four or more graphics devices. In that
+case, CUDA_VISIBLE_DEVICES will reference unique devices for each job step and
+the output might resemble this:</P>
+
+<PRE>
+JobStep=1234.0 CUDA_VISIBLE_DEVICES=0,1
+JobStep=1234.1 CUDA_VISIBLE_DEVICES=2
+JobStep=1234.2 CUDA_VISIBLE_DEVICES=3
+</PRE>
+
+<P>NOTE: Be sure to specify the <I>File</I> parameters in the <I>gres.conf</I>
+file and ensure they are in increasing numeric order.</P>
+<!-------------------------------------------------------------------------->
+<h2>Future</h2>
+
+<P>Our plans for the near future call for integrating SLURM GRES support with
+Linux <I>cgroups</I> in order to remove access to devices not allocated to
+a job or job step.</P>
 <!-------------------------------------------------------------------------->
 
 <p style="text-align: center;">Last modified 29 August 2010</p>
diff --git a/doc/man/man5/gres.conf.5 b/doc/man/man5/gres.conf.5
index bb0826bf8ce..301fcb0241d 100644
--- a/doc/man/man5/gres.conf.5
+++ b/doc/man/man5/gres.conf.5
@@ -90,7 +90,7 @@ Name=gpu File=/dev/nvidia2 CPUs=2,3
 .br
 Name=gpu File=/dev/nvidia3 CPUs=2,3
 .br
-Name=bandwidth Count=20
+Name=bandwidth Count=20M
 
 .SH "COPYING"
 Copyright (C) 2010 The Regents of the University of California.
diff --git a/src/common/gres.c b/src/common/gres.c
index cca6b94cd52..903f425e57e 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -87,6 +87,8 @@
 /* Gres symbols provided by the plugin */
 typedef struct slurm_gres_ops {
 	int		(*node_config_load)	( List gres_conf_list );
+	void		(*job_set_env)		( char ***job_env_ptr,
+						  void *gres_ptr );
 	void		(*step_set_env)		( char ***job_env_ptr,
 						  void *gres_ptr );
 } slurm_gres_ops_t;
@@ -225,6 +227,7 @@ static int _load_gres_plugin(char *plugin_name,
 	 */
 	static const char *syms[] = {
 		"node_config_load",
+		"job_set_env",
 		"step_set_env",
 	};
 	int n_syms = sizeof(syms) / sizeof(char *);
@@ -2850,6 +2853,44 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
 	return rc;
 }
 
+/*
+ * Set environment variables as required for a batch job
+ * IN/OUT job_env_ptr - environment variable array
+ * IN job_gres_list - generated by gres_plugin_job_alloc()
+ */
+extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list)
+{
+	int i;
+	ListIterator gres_iter;
+	gres_state_t *gres_ptr = NULL;
+
+	(void) gres_plugin_init();
+
+	slurm_mutex_lock(&gres_context_lock);
+	for (i=0; i<gres_context_cnt; i++) {
+		if (gres_context[i].ops.job_set_env == NULL)
+			continue;	/* No plugin to call */
+		if (job_gres_list) {
+			gres_iter = list_iterator_create(job_gres_list);
+			while ((gres_ptr = (gres_state_t *)
+					   list_next(gres_iter))) {
+				if (gres_ptr->plugin_id !=
+				    gres_context[i].plugin_id)
+					continue;
+				(*(gres_context[i].ops.job_set_env))
+					(job_env_ptr, gres_ptr->gres_data);
+				break;
+			}
+			list_iterator_destroy(gres_iter);
+		}
+		if (gres_ptr == NULL) { /* No data found */
+			(*(gres_context[i].ops.job_set_env))
+					(job_env_ptr, NULL);
+		}
+	}
+	slurm_mutex_unlock(&gres_context_lock);
+}
+
 static void _job_state_log(void *gres_data, uint32_t job_id, char *gres_name)
 {
 	gres_job_state_t *gres_ptr;
@@ -3394,7 +3435,7 @@ extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list)
 {
 	int i;
 	ListIterator gres_iter;
-	gres_state_t *gres_ptr;
+	gres_state_t *gres_ptr = NULL;
 
 	(void) gres_plugin_init();
 
@@ -3402,19 +3443,23 @@ extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list)
 	for (i=0; i<gres_context_cnt; i++) {
 		if (gres_context[i].ops.step_set_env == NULL)
 			continue;	/* No plugin to call */
-		gres_iter = list_iterator_create(step_gres_list);
-		while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
-			if (gres_ptr->plugin_id != gres_context[i].plugin_id)
-				continue;
-			(*(gres_context[i].ops.step_set_env))
+		if (step_gres_list) {
+			gres_iter = list_iterator_create(step_gres_list);
+			while ((gres_ptr = (gres_state_t *)
+					   list_next(gres_iter))) {
+				if (gres_ptr->plugin_id !=
+				    gres_context[i].plugin_id)
+					continue;
+				(*(gres_context[i].ops.step_set_env))
 					(job_env_ptr, gres_ptr->gres_data);
-			break;
+				break;
+			}
+			list_iterator_destroy(gres_iter);
 		}
 		if (gres_ptr == NULL) { /* No data found */
 			(*(gres_context[i].ops.step_set_env))
 					(job_env_ptr, NULL);
 		}
-		list_iterator_destroy(gres_iter);
 	}
 	slurm_mutex_unlock(&gres_context_lock);
 }
diff --git a/src/common/gres.h b/src/common/gres.h
index f16f538360a..ec9b2ac6412 100644
--- a/src/common/gres.h
+++ b/src/common/gres.h
@@ -370,6 +370,13 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
 				   int node_offset, uint32_t job_id,
 				   char *node_name);
 
+/*
+ * Set environment variables as required for a batch job
+ * IN/OUT job_env_ptr - environment variable array
+ * IN job_gres_list - generated by gres_plugin_job_alloc()
+ */
+extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list);
+
 /*
  * Log a job's current gres state
  * IN gres_list - generated by gres_plugin_job_state_validate()
@@ -433,7 +440,6 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
   */
 extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list);
 
-
 /*
  * Log a step's current gres state
  * IN gres_list - generated by gres_plugin_step_allocate()
diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c
index 7eb59bb1817..56168ac99fc 100644
--- a/src/plugins/gres/gpu/gres_gpu.c
+++ b/src/plugins/gres/gpu/gres_gpu.c
@@ -136,6 +136,40 @@ extern int node_config_load(List gres_conf_list)
 	return rc;
 }
 
+/*
+ * Set environment variables as appropriate for a job (i.e. all tasks) based
+ * upon the job's GRES state.
+ */
+extern void job_set_env(char ***job_env_ptr, void *gres_ptr)
+{
+	int i, len;
+	char *dev_list = NULL;
+	gres_job_state_t *gres_job_ptr = (gres_job_state_t *) gres_ptr;
+
+	if ((gres_job_ptr != NULL) &&
+	    (gres_job_ptr->node_cnt == 1) &&
+	    (gres_job_ptr->gres_bit_alloc != NULL) &&
+	    (gres_job_ptr->gres_bit_alloc[0] != NULL)) {
+		len = bit_size(gres_job_ptr->gres_bit_alloc[0]);
+		for (i=0; i<len; i++) {
+			if (!bit_test(gres_job_ptr->gres_bit_alloc[0], i))
+				continue;
+			if (!dev_list)
+				dev_list = xmalloc(128);
+			else
+				xstrcat(dev_list, ",");
+			xstrfmtcat(dev_list, "%d", i);
+		}
+	}
+	if (dev_list) {
+		env_array_overwrite(job_env_ptr,"CUDA_VISIBLE_DEVICES",
+				    dev_list);
+		xfree(dev_list);
+	} else {
+		env_array_overwrite(job_env_ptr,"CUDA_VISIBLE_DEVICES", "");
+	}
+}
+
 /*
  * Set environment variables as appropriate for a job (i.e. all tasks) based
  * upon the job step's GRES state.
@@ -146,7 +180,8 @@ extern void step_set_env(char ***job_env_ptr, void *gres_ptr)
 	char *dev_list = NULL;
 	gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr;
 
-	if ((gres_step_ptr->node_cnt == 1) &&
+	if ((gres_step_ptr != NULL) &&
+	    (gres_step_ptr->node_cnt == 1) &&
 	    (gres_step_ptr->gres_bit_alloc != NULL) &&
 	    (gres_step_ptr->gres_bit_alloc[0] != NULL)) {
 		len = bit_size(gres_step_ptr->gres_bit_alloc[0]);
diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c
index 89b1dd9cede..da120712d6e 100644
--- a/src/plugins/gres/nic/gres_nic.c
+++ b/src/plugins/gres/nic/gres_nic.c
@@ -134,6 +134,15 @@ extern int node_config_load(List gres_conf_list)
 	return rc;
 }
 
+/*
+ * Set environment variables as appropriate for a job (i.e. all tasks) based
+ * upon the job's GRES state.
+ */
+extern void job_set_env(char ***job_env_ptr, void *gres_ptr)
+{
+	/* EMPTY */
+}
+
 /*
  * Set environment variables as appropriate for a job (i.e. all tasks) based
  * upon the job step's GRES state.
diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c
index ca8358ce12a..5ef4041b9e2 100644
--- a/src/slurmd/slurmstepd/slurmstepd.c
+++ b/src/slurmd/slurmstepd/slurmstepd.c
@@ -381,12 +381,16 @@ _step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
 	job->jmgr_pid = getpid();
 	job->jobacct = jobacct_gather_g_create(NULL);
 
+	/* Establish GRES environment variables */
 	if (conf->debug_flags & DEBUG_FLAG_GRES) {
 		gres_plugin_job_state_log(job->job_gres_list, job->jobid);
 		gres_plugin_step_state_log(job->step_gres_list, job->jobid,
 					   job->stepid);
 	}
-	gres_plugin_step_set_env(&job->env, job->step_gres_list);
+	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH)
+		gres_plugin_job_set_env(&job->env, job->job_gres_list);
+	else if (msg->msg_type == REQUEST_LAUNCH_TASKS)
+		gres_plugin_step_set_env(&job->env, job->step_gres_list);
 
 	/*
 	 * Add slurmd node topology informations to job env array
-- 
GitLab